author     Dimitry Andric <dim@FreeBSD.org>    2017-07-13 19:25:18 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-07-13 19:25:18 +0000
commit     ca089b24d48ef6fa8da2d0bb8c25bb802c4a95c0 (patch)
tree       3a28a772df9b17aef34f49e3c727965ad28c0c93 /test
parent     9df3605dea17e84f8183581f6103bd0c79e2a606 (diff)
Diffstat (limited to 'test')
596 files changed, 80721 insertions, 12726 deletions
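
Note (illustration, not part of the commit): the dominant mechanical change in the diff below is the LLVM IR assembler switch from the bare `singlethread` keyword to the generalized syncscope("<name>") syntax on atomic instructions, which also admits target-defined scope names. A minimal sketch of the new spelling follows; the function name @syncscope_sketch is hypothetical, while the scope strings ("singlethread", "agent", "workgroup") are taken verbatim from the updated tests:

    define void @syncscope_sketch(i32* %x) {
      ; previously spelled: fence singlethread seq_cst
      fence syncscope("singlethread") seq_cst
      ; target-defined scopes are now written as quoted strings:
      %v = load atomic volatile i32, i32* %x syncscope("agent") acquire, align 4
      store atomic volatile i32 3, i32* %x syncscope("workgroup") monotonic, align 4
      ret void
    }

A second recurring cleanup in the RUN lines replaces the csh-style redirection `>& /dev/null` with the portable POSIX form `> /dev/null 2>&1`.
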
diff --git a/test/Analysis/BasicAA/unreachable-block.ll b/test/Analysis/BasicAA/unreachable-block.ll
index 551d18e3e0fb3..d6c149f816618 100644
--- a/test/Analysis/BasicAA/unreachable-block.ll
+++ b/test/Analysis/BasicAA/unreachable-block.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -aa-eval -disable-output < %s >& /dev/null
+; RUN: opt -basicaa -aa-eval -disable-output < %s > /dev/null 2>&1
 
 ; BasicAA shouldn't infinitely recurse on the use-def cycles in
 ; unreachable code.
diff --git a/test/Analysis/CostModel/X86/slm-arith-costs.ll b/test/Analysis/CostModel/X86/slm-arith-costs.ll
index 3673a5d9e0673..a767aa30b8ed1 100644
--- a/test/Analysis/CostModel/X86/slm-arith-costs.ll
+++ b/test/Analysis/CostModel/X86/slm-arith-costs.ll
@@ -3,6 +3,20 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+define <2 x i64> @slm-costs_64_vector_add(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} add <2 x i64>
+  %res = add <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @slm-costs_64_vector_sub(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} sub <2 x i64>
+  %res = sub <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
 ; 8bit mul
 define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) {
 entry:
@@ -13,7 +27,7 @@ entry:
 
 define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i8>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i8>
   %res = mul nsw <2 x i8> %a, %b
   ret <2 x i8> %res
 }
@@ -97,7 +111,7 @@ entry:
 
 define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i16>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i16>
   %res = mul nsw <2 x i16> %a, %b
   ret <2 x i16> %res
 }
@@ -181,7 +195,7 @@ entry:
 
 define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i32>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i32>
   %res = mul nsw <2 x i32> %a, %b
   ret <2 x i32> %res
 }
@@ -217,28 +231,28 @@ entry:
 
 define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i64>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i64>
   %res = mul nsw <2 x i64> %a, %b
   ret <2 x i64> %res
 }
 
 define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) {
 entry:
-; SLM: cost of 22 {{.*}} mul nsw <4 x i64>
+; SLM: cost of 34 {{.*}} mul nsw <4 x i64>
   %res = mul nsw <4 x i64> %a, %b
   ret <4 x i64> %res
 }
 
 define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) {
 entry:
-; SLM: cost of 44 {{.*}} mul nsw <8 x i64>
+; SLM: cost of 68 {{.*}} mul nsw <8 x i64>
   %res = mul nsw <8 x i64> %a, %b
   ret <8 x i64> %res
 }
 
 define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) {
 entry:
-; SLM: cost of 88 {{.*}} mul nsw <16 x i64>
+; SLM: cost of 136 {{.*}} mul nsw <16 x i64>
   %res = mul nsw <16 x i64> %a, %b
   ret <16 x i64> %res
 }
diff --git a/test/Analysis/DependenceAnalysis/BasePtrBug.ll b/test/Analysis/DependenceAnalysis/BasePtrBug.ll
new file mode 100644
index 0000000000000..8de75df7dbdd7
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/BasePtrBug.ll
@@ -0,0 +1,80 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; Test that the dependence analysis generates the correct results when using
+; an aliased object that points to a different element in the same array.
+
+; PR33567 - https://bugs.llvm.org/show_bug.cgi?id=33567
+
+; void test1(int *A, int *B, int N) {
+;   int *top = A;
+;   int *bot = A + N/2;
+;   for (int i = 0; i < N; i++)
+;     B[i] = top[i] + bot[i];
+; }
+
+; CHECK-LABEL: test1
+; CHECK: da analyze - input [*|<]!
+
+define void @test1(i32* nocapture %A, i32* nocapture %B, i32 %N) #0 {
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %div = sdiv i32 %N, 2
+  %bot.gep = getelementptr i32, i32* %A, i32 %div
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %gep.0 = getelementptr i32, i32* %A, i32 %i
+  %gep.1 = getelementptr i32, i32* %bot.gep, i32 %i
+  %gep.B = getelementptr i32, i32* %B, i32 %i
+  %0 = load i32, i32* %gep.0, align 4
+  %1 = load i32, i32* %gep.1, align 4
+  %add = add nsw i32 %1, %0
+  store i32 %add, i32* %gep.B, align 4
+  %inc = add nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+; void test2(int *A, unsigned n) {
+;   int *B = A + 1;
+;   for (unsigned i = 0; i < n; ++i) {
+;     A[i] = B[i];
+;   }
+; }
+
+; CHECK-LABEL: test2
+; CHECK: da analyze - consistent anti [1]!
+
+define void @test2(i32*, i32) #3 {
+  %3 = getelementptr inbounds i32, i32* %0, i64 1
+  br label %4
+
+; <label>:4:
+  %.0 = phi i32 [ 0, %2 ], [ %14, %13 ]
+  %5 = sub i32 %1, 1
+  %6 = icmp ult i32 %.0, %5
+  br i1 %6, label %7, label %15
+
+; <label>:7:
+  %8 = zext i32 %.0 to i64
+  %9 = getelementptr inbounds i32, i32* %3, i64 %8
+  %10 = load i32, i32* %9, align 4
+  %11 = zext i32 %.0 to i64
+  %12 = getelementptr inbounds i32, i32* %0, i64 %11
+  store i32 %10, i32* %12, align 4
+  br label %13
+
+; <label>:13:
+  %14 = add i32 %.0, 1
+  br label %4
+
+; <label>:15:
+  ret void
+}
diff --git a/test/Analysis/ScalarEvolution/guards.ll b/test/Analysis/ScalarEvolution/guards.ll
index 52ad4dc73d417..d4b1f431ffc6a 100644
--- a/test/Analysis/ScalarEvolution/guards.ll
+++ b/test/Analysis/ScalarEvolution/guards.ll
@@ -19,7 +19,7 @@ entry:
 loop:
 ; CHECK: loop:
 ; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 true) [ "deopt"() ]
-; CHECK: %iv.inc.cmp = icmp slt i32 %iv.inc, %len
+; CHECK: %iv.inc.cmp = icmp ult i32 %iv.inc, %len
 ; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %iv.inc.cmp) [ "deopt"() ]
 
 ; CHECK: leave:
@@ -41,7 +41,7 @@ leave:
 
 define void @test_2(i32 %n, i32* %len_buf) {
 ; CHECK-LABEL: @test_2(
-; CHECK: [[LEN_SEXT:%[^ ]+]] = sext i32 %len to i64
+; CHECK: [[LEN_ZEXT:%[^ ]+]] = zext i32 %len to i64
 ; CHECK: br label %loop
 
 entry:
@@ -52,7 +52,7 @@ loop:
 ; CHECK: loop:
 ; CHECK: %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
 ; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-; CHECK: %iv.inc.cmp = icmp slt i64 %indvars.iv.next, [[LEN_SEXT]]
+; CHECK: %iv.inc.cmp = icmp ult i64 %indvars.iv.next, [[LEN_ZEXT]]
 ; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %iv.inc.cmp) [ "deopt"() ]
 
 ; CHECK: leave:
diff --git a/test/Assembler/2003-11-11-ImplicitRename.ll b/test/Assembler/2003-11-11-ImplicitRename.ll
index 7bfd3c14bf194..84065a17846db 100644
--- a/test/Assembler/2003-11-11-ImplicitRename.ll
+++ b/test/Assembler/2003-11-11-ImplicitRename.ll
@@ -1,8 +1,7 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
 
 void %test() {
   %X = add int 0, 1
   %X = add int 1, 2
   ret void
 }
-
diff --git a/test/Assembler/2007-11-26-AttributeOverload.ll b/test/Assembler/2007-11-26-AttributeOverload.ll
index aebc2e8d01e5f..ab5d514a38b6e 100644
--- a/test/Assembler/2007-11-26-AttributeOverload.ll
+++ b/test/Assembler/2007-11-26-AttributeOverload.ll
@@ -1,4 +1,4 @@
-; RUN: not llvm-as < %s >& /dev/null
+; RUN: not llvm-as < %s > /dev/null 2>&1
 
 declare i32 @atoi(i8*) nounwind readonly
 declare i32 @atoi(i8*)
diff --git a/test/Assembler/atomic.ll b/test/Assembler/atomic.ll
index 148b95d88e307..a8b527f2f863c 100644
--- a/test/Assembler/atomic.ll
+++ b/test/Assembler/atomic.ll
@@ -5,14 +5,20 @@ define void @f(i32* %x) {
   ; CHECK: load atomic i32, i32* %x unordered, align 4
   load atomic i32, i32* %x unordered, align 4
-  ; CHECK: load atomic volatile i32, i32* %x singlethread acquire, align 4
-  load atomic volatile i32, i32* %x singlethread acquire, align 4
+  ; CHECK: load atomic volatile i32, i32* %x syncscope("singlethread") acquire, align 4
+  load atomic volatile i32, i32* %x syncscope("singlethread") acquire, align 4
+  ; CHECK: load atomic volatile i32, i32* %x syncscope("agent") acquire, align 4
+  load atomic volatile i32, i32* %x syncscope("agent") acquire, align 4
   ; CHECK: store atomic i32 3, i32* %x release, align 4
   store atomic i32 3, i32* %x release, align 4
-  ; CHECK: store atomic volatile i32 3, i32* %x singlethread monotonic, align 4
-  store atomic volatile i32 3, i32* %x singlethread monotonic, align 4
-  ; CHECK: cmpxchg i32* %x, i32 1, i32 0 singlethread monotonic monotonic
-  cmpxchg i32* %x, i32 1, i32 0 singlethread monotonic monotonic
+  ; CHECK: store atomic volatile i32 3, i32* %x syncscope("singlethread") monotonic, align 4
+  store atomic volatile i32 3, i32* %x syncscope("singlethread") monotonic, align 4
+  ; CHECK: store atomic volatile i32 3, i32* %x syncscope("workgroup") monotonic, align 4
+  store atomic volatile i32 3, i32* %x syncscope("workgroup") monotonic, align 4
+  ; CHECK: cmpxchg i32* %x, i32 1, i32 0 syncscope("singlethread") monotonic monotonic
+  cmpxchg i32* %x, i32 1, i32 0 syncscope("singlethread") monotonic monotonic
+  ; CHECK: cmpxchg i32* %x, i32 1, i32 0 syncscope("workitem") monotonic monotonic
+  cmpxchg i32* %x, i32 1, i32 0 syncscope("workitem") monotonic monotonic
   ; CHECK: cmpxchg volatile i32* %x, i32 0, i32 1 acq_rel acquire
   cmpxchg volatile i32* %x, i32 0, i32 1 acq_rel acquire
   ; CHECK: cmpxchg i32* %x, i32 42, i32 0 acq_rel monotonic
@@ -23,9 +29,13 @@ define void @f(i32* %x) {
   atomicrmw add i32* %x, i32 10 seq_cst
   ; CHECK: atomicrmw volatile xchg i32* %x, i32 10 monotonic
   atomicrmw volatile xchg i32* %x, i32 10 monotonic
-  ; CHECK: fence singlethread release
-  fence singlethread release
+  ; CHECK: atomicrmw volatile xchg i32* %x, i32 10 syncscope("agent") monotonic
+  atomicrmw volatile xchg i32* %x, i32 10 syncscope("agent") monotonic
+  ; CHECK: fence syncscope("singlethread") release
+  fence syncscope("singlethread") release
   ; CHECK: fence seq_cst
   fence seq_cst
+  ; CHECK: fence syncscope("device") seq_cst
+  fence syncscope("device") seq_cst
   ret void
 }
diff --git a/test/Bitcode/Inputs/module-hash-strtab1.ll b/test/Bitcode/Inputs/module-hash-strtab1.ll
new file mode 100644
index 0000000000000..6b4a3fce07eff
--- /dev/null
+++ b/test/Bitcode/Inputs/module-hash-strtab1.ll
@@ -0,0 +1,10 @@
+source_filename = "foo.c"
+
+$com = comdat any
+
+define void @main() comdat($com) {
+  call void @bar()
+  ret void
+}
+
+declare void @bar()
diff --git a/test/Bitcode/Inputs/module-hash-strtab2.ll b/test/Bitcode/Inputs/module-hash-strtab2.ll
new file mode 100644
index 0000000000000..87d2478145bc3
--- /dev/null
+++ b/test/Bitcode/Inputs/module-hash-strtab2.ll
@@ -0,0 +1,10 @@
+source_filename = "foo.c"
+
+$dat = comdat any
+
+define void @main() comdat($dat) {
+  call void @foo()
+  ret void
+}
+
+declare void @foo()
diff --git a/test/Bitcode/atomic-no-syncscope.ll b/test/Bitcode/atomic-no-syncscope.ll
new file mode 100644
index 0000000000000..a57507bc81468
--- /dev/null
+++ b/test/Bitcode/atomic-no-syncscope.ll
@@ -0,0 +1,17 @@
+; RUN: llvm-dis -o - %s.bc | FileCheck %s
+
+; Backwards compatibility test: make sure we can process bitcode without
+; synchronization scope names encoded in it.
+
+; CHECK: load atomic i32, i32* %x unordered, align 4
+; CHECK: load atomic volatile i32, i32* %x syncscope("singlethread") acquire, align 4
+; CHECK: store atomic i32 3, i32* %x release, align 4
+; CHECK: store atomic volatile i32 3, i32* %x syncscope("singlethread") monotonic, align 4
+; CHECK: cmpxchg i32* %x, i32 1, i32 0 syncscope("singlethread") monotonic monotonic
+; CHECK: cmpxchg volatile i32* %x, i32 0, i32 1 acq_rel acquire
+; CHECK: cmpxchg i32* %x, i32 42, i32 0 acq_rel monotonic
+; CHECK: cmpxchg weak i32* %x, i32 13, i32 0 seq_cst monotonic
+; CHECK: atomicrmw add i32* %x, i32 10 seq_cst
+; CHECK: atomicrmw volatile xchg i32* %x, i32 10 monotonic
+; CHECK: fence syncscope("singlethread") release
+; CHECK: fence seq_cst
diff --git a/test/Bitcode/atomic-no-syncscope.ll.bc b/test/Bitcode/atomic-no-syncscope.ll.bc
new file mode 100644
index 0000000000000..01d565eb4426e
--- /dev/null
+++ b/test/Bitcode/atomic-no-syncscope.ll.bc
Binary files differ
diff --git a/test/Bitcode/atomic.ll b/test/Bitcode/atomic.ll
index c09e74c1c2f24..bef3f2712935a 100644
--- a/test/Bitcode/atomic.ll
+++ b/test/Bitcode/atomic.ll
@@ -11,8 +11,8 @@ define void @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
   cmpxchg weak i32* %addr, i32 %desired, i32 %new acq_rel acquire
   ; CHECK: cmpxchg weak i32* %addr, i32 %desired, i32 %new acq_rel acquire
 
-  cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread release monotonic
-  ; CHECK: cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread release monotonic
+  cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new syncscope("singlethread") release monotonic
+  ; CHECK: cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new syncscope("singlethread") release monotonic
   ret void
 }
diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll
index 8d51ee11a209b..cf6c30e7c26c1 100644
--- a/test/Bitcode/compatibility-3.6.ll
+++ b/test/Bitcode/compatibility-3.6.ll
@@ -551,8 +551,8 @@ define void @atomics(i32* %word) {
   ; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
   %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
   ; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
-  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
-  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
   %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   ; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -571,33 +571,33 @@ define void @atomics(i32* %word) {
   ; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
   %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
   ; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
-  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
-  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
   fence acquire
   ; CHECK: fence acquire
   fence release
   ; CHECK: fence release
   fence acq_rel
   ; CHECK: fence acq_rel
-  fence singlethread seq_cst
-  ; CHECK: fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
+  ; CHECK: fence syncscope("singlethread") seq_cst
   ; XXX: The parser spits out the load type here.
   %ld.1 = load atomic i32* %word monotonic, align 4
   ; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
   %ld.2 = load atomic volatile i32* %word acquire, align 8
   ; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
-  %ld.3 = load atomic volatile i32* %word singlethread seq_cst, align 16
-  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+  %ld.3 = load atomic volatile i32* %word syncscope("singlethread") seq_cst, align 16
+  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
   store atomic i32 23, i32* %word monotonic, align 4
   ; CHECK: store atomic i32 23, i32* %word monotonic, align 4
   store atomic volatile i32 24, i32* %word monotonic, align 4
   ; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
-  store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
-  ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+  store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+  ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
   ret void
 }
diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll
index ebdf4c30587c9..180dad258b682 100644
--- a/test/Bitcode/compatibility-3.7.ll
+++ b/test/Bitcode/compatibility-3.7.ll
@@ -596,8 +596,8 @@ define void @atomics(i32* %word) {
   ; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
   %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
   ; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
-  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
-  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
   %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   ; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -616,32 +616,32 @@ define void @atomics(i32* %word) {
   ; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
   %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
   ; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
-  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
-  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
   fence acquire
   ; CHECK: fence acquire
   fence release
   ; CHECK: fence release
   fence acq_rel
   ; CHECK: fence acq_rel
-  fence singlethread seq_cst
-  ; CHECK: fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
+  ; CHECK: fence syncscope("singlethread") seq_cst
   %ld.1 = load atomic i32, i32* %word monotonic, align 4
   ; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
   %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
   ; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
-  %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
-  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+  %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
   store atomic i32 23, i32* %word monotonic, align 4
   ; CHECK: store atomic i32 23, i32* %word monotonic, align 4
   store atomic volatile i32 24, i32* %word monotonic, align 4
   ; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
-  store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
-  ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+  store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+  ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
   ret void
 }
diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll
index 57ea3e068376f..370c7f51a2b7d 100644
--- a/test/Bitcode/compatibility-3.8.ll
+++ b/test/Bitcode/compatibility-3.8.ll
@@ -627,8 +627,8 @@ define void @atomics(i32* %word) {
   ; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
   %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
   ; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
-  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
-  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
   %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   ; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -647,32 +647,32 @@ define void @atomics(i32* %word) {
   ; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
   %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
   ; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
-  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
-  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
   fence acquire
   ; CHECK: fence acquire
   fence release
   ; CHECK: fence release
   fence acq_rel
   ; CHECK: fence acq_rel
-  fence singlethread seq_cst
-  ; CHECK: fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
+  ; CHECK: fence syncscope("singlethread") seq_cst
   %ld.1 = load atomic i32, i32* %word monotonic, align 4
   ; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
   %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
   ; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
-  %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
-  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+  %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
   store atomic i32 23, i32* %word monotonic, align 4
   ; CHECK: store atomic i32 23, i32* %word monotonic, align 4
   store atomic volatile i32 24, i32* %word monotonic, align 4
   ; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
-  store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
-  ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+  store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+  ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
   ret void
 }
diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll
index 2a6cfe14cdb14..4115cbd8fe64d 100644
--- a/test/Bitcode/compatibility-3.9.ll
+++ b/test/Bitcode/compatibility-3.9.ll
@@ -698,8 +698,8 @@ define void @atomics(i32* %word) {
   ; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
   %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
   ; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
-  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
-  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
   %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   ; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -718,32 +718,32 @@ define void @atomics(i32* %word) {
   ; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
   %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
   ; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
-  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
-  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
   fence acquire
   ; CHECK: fence acquire
   fence release
   ; CHECK: fence release
   fence acq_rel
   ; CHECK: fence acq_rel
-  fence singlethread seq_cst
-  ; CHECK: fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
+  ; CHECK: fence syncscope("singlethread") seq_cst
   %ld.1 = load atomic i32, i32* %word monotonic, align 4
   ; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
   %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
   ; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
-  %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
-  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+  %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
   store atomic i32 23, i32* %word monotonic, align 4
   ; CHECK: store atomic i32 23, i32* %word monotonic, align 4
   store atomic volatile i32 24, i32* %word monotonic, align 4
   ; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
-  store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
-  ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+  store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+  ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
   ret void
 }
diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll
index c83c107a2927a..eef925564ecbf 100644
--- a/test/Bitcode/compatibility-4.0.ll
+++ b/test/Bitcode/compatibility-4.0.ll
@@ -698,8 +698,8 @@ define void @atomics(i32* %word) {
   ; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
   %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
   ; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
-  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
-  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
   %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   ; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -718,32 +718,32 @@ define void @atomics(i32* %word) {
   ; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
   %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
   ; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
-  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
-  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
   fence acquire
   ; CHECK: fence acquire
   fence release
   ; CHECK: fence release
   fence acq_rel
   ; CHECK: fence acq_rel
-  fence singlethread seq_cst
-  ; CHECK: fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
+  ; CHECK: fence syncscope("singlethread") seq_cst
   %ld.1 = load atomic i32, i32* %word monotonic, align 4
   ; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
   %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
   ; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
-  %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
-  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+  %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
   store atomic i32 23, i32* %word monotonic, align 4
   ; CHECK: store atomic i32 23, i32* %word monotonic, align 4
   store atomic volatile i32 24, i32* %word monotonic, align 4
   ; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
-  store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
-  ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+  store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+  ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
   ret void
 }
diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll
index ec69344947c5e..ebd727ba9aeee 100644
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@@ -705,8 +705,8 @@ define void @atomics(i32* %word) {
   ; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
   %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
   ; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
-  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
-  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
+  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 syncscope("singlethread") seq_cst monotonic
   %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   ; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
   %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
@@ -725,32 +725,32 @@ define void @atomics(i32* %word) {
   ; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
   %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
   ; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
-  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
-  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
-  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 syncscope("singlethread") monotonic
+  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
+  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 syncscope("singlethread") monotonic
   fence acquire
   ; CHECK: fence acquire
   fence release
   ; CHECK: fence release
   fence acq_rel
   ; CHECK: fence acq_rel
-  fence singlethread seq_cst
-  ; CHECK: fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
+  ; CHECK: fence syncscope("singlethread") seq_cst
   %ld.1 = load atomic i32, i32* %word monotonic, align 4
   ; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
   %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
   ; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
-  %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
-  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+  %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
+  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word syncscope("singlethread") seq_cst, align 16
   store atomic i32 23, i32* %word monotonic, align 4
   ; CHECK: store atomic i32 23, i32* %word monotonic, align 4
   store atomic volatile i32 24, i32* %word monotonic, align 4
   ; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
-  store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
-  ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+  store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
+  ; CHECK: store atomic volatile i32 25, i32* %word syncscope("singlethread") monotonic, align 4
   ret void
 }
diff --git a/test/Bitcode/memInstructions.3.2.ll b/test/Bitcode/memInstructions.3.2.ll
index 1ab05b6d1b422..c530b6d2cb174 100644
--- a/test/Bitcode/memInstructions.3.2.ll
+++ b/test/Bitcode/memInstructions.3.2.ll
@@ -107,29 +107,29 @@ entry:
 ; CHECK-NEXT: %res8 = load atomic volatile i8, i8* %ptr1 seq_cst, align 1
   %res8 = load atomic volatile i8, i8* %ptr1 seq_cst, align 1
 
-; CHECK-NEXT: %res9 = load atomic i8, i8* %ptr1 singlethread unordered, align 1
-  %res9 = load atomic i8, i8* %ptr1 singlethread unordered, align 1
+; CHECK-NEXT: %res9 = load atomic i8, i8* %ptr1 syncscope("singlethread") unordered, align 1
+  %res9 = load atomic i8, i8* %ptr1 syncscope("singlethread") unordered, align 1
 
-; CHECK-NEXT: %res10 = load atomic i8, i8* %ptr1 singlethread monotonic, align 1
-  %res10 = load atomic i8, i8* %ptr1 singlethread monotonic, align 1
+; CHECK-NEXT: %res10 = load atomic i8, i8* %ptr1 syncscope("singlethread") monotonic, align 1
+  %res10 = load atomic i8, i8* %ptr1 syncscope("singlethread") monotonic, align 1
 
-; CHECK-NEXT: %res11 = load atomic i8, i8* %ptr1 singlethread acquire, align 1
-  %res11 = load atomic i8, i8* %ptr1 singlethread acquire, align 1
+; CHECK-NEXT: %res11 = load atomic i8, i8* %ptr1 syncscope("singlethread") acquire, align 1
+  %res11 = load atomic i8, i8* %ptr1 syncscope("singlethread") acquire, align 1
 
-; CHECK-NEXT: %res12 = load atomic i8, i8* %ptr1 singlethread seq_cst, align 1
-  %res12 = load atomic i8, i8* %ptr1 singlethread seq_cst, align 1
+; CHECK-NEXT: %res12 = load atomic i8, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
+  %res12 = load atomic i8, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
 
-; CHECK-NEXT: %res13 = load atomic volatile i8, i8* %ptr1 singlethread unordered, align 1
-  %res13 = load atomic volatile i8, i8* %ptr1 singlethread unordered, align 1
+; CHECK-NEXT: %res13 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") unordered, align 1
+  %res13 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") unordered, align 1
 
-; CHECK-NEXT: %res14 = load atomic volatile i8, i8* %ptr1 singlethread monotonic, align 1
-  %res14 = load atomic volatile i8, i8* %ptr1 singlethread monotonic, align 1
+; CHECK-NEXT: %res14 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") monotonic, align 1
+  %res14 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") monotonic, align 1
 
-; CHECK-NEXT: %res15 = load atomic volatile i8, i8* %ptr1 singlethread acquire, align 1
-  %res15 = load atomic volatile i8, i8* %ptr1 singlethread acquire, align 1
+; CHECK-NEXT: %res15 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") acquire, align 1
+  %res15 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") acquire, align 1
 
-; CHECK-NEXT: %res16 = load atomic volatile i8, i8* %ptr1 singlethread seq_cst, align 1
-  %res16 = load atomic volatile i8, i8* %ptr1 singlethread seq_cst, align 1
+; CHECK-NEXT: %res16 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
+  %res16 = load atomic volatile i8, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
 
   ret void
 }
@@ -193,29 +193,29 @@ entry:
 ; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 seq_cst, align 1
   store atomic volatile i8 2, i8* %ptr1 seq_cst, align 1
 
-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread unordered, align 1
-  store atomic i8 2, i8* %ptr1 singlethread unordered, align 1
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 syncscope("singlethread") unordered, align 1
+  store atomic i8 2, i8* %ptr1 syncscope("singlethread") unordered, align 1
 
-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread monotonic, align 1
-  store atomic i8 2, i8* %ptr1 singlethread monotonic, align 1
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 syncscope("singlethread") monotonic, align 1
+  store atomic i8 2, i8* %ptr1 syncscope("singlethread") monotonic, align 1
 
-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread release, align 1
-  store atomic i8 2, i8* %ptr1 singlethread release, align 1
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 syncscope("singlethread") release, align 1
+  store atomic i8 2, i8* %ptr1 syncscope("singlethread") release, align 1
 
-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread seq_cst, align 1
-  store atomic i8 2, i8* %ptr1 singlethread seq_cst, align 1
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
+  store atomic i8 2, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
 
-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread unordered, align 1
-  store atomic volatile i8 2, i8* %ptr1 singlethread unordered, align 1
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") unordered, align 1
+  store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") unordered, align 1
 
-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread monotonic, align 1
-  store atomic volatile i8 2, i8* %ptr1 singlethread monotonic, align 1
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") monotonic, align 1
+  store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") monotonic, align 1
 
-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread release, align 1
-  store atomic volatile i8 2, i8* %ptr1 singlethread release, align 1
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") release, align 1
+  store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") release, align 1
 
-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread seq_cst, align 1
-  store atomic volatile i8 2, i8* %ptr1 singlethread seq_cst, align 1
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
+  store atomic volatile i8 2, i8* %ptr1 syncscope("singlethread") seq_cst, align 1
 
   ret void
 }
@@ -232,13 +232,13 @@ entry:
 ; CHECK-NEXT: %res2 = extractvalue { i32, i1 } [[TMP]], 0
   %res2 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") monotonic monotonic
 ; CHECK-NEXT: %res3 = extractvalue { i32, i1 } [[TMP]], 0
-  %res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+  %res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") monotonic monotonic
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") monotonic monotonic
 ; CHECK-NEXT: %res4 = extractvalue { i32, i1 } [[TMP]], 0
-  %res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+  %res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") monotonic monotonic
 
 
 ; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire
@@ -249,13 +249,13 @@ entry:
 ; CHECK-NEXT: %res6 = extractvalue { i32, i1 } [[TMP]], 0
   %res6 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acquire acquire
 ; CHECK-NEXT: %res7 = extractvalue { i32, i1 } [[TMP]], 0
-  %res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+  %res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acquire acquire
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acquire acquire
 ; CHECK-NEXT: %res8 = extractvalue { i32, i1 } [[TMP]], 0
-  %res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+  %res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acquire acquire
 
 
 ; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic
@@ -266,13 +266,13 @@ entry:
 ; CHECK-NEXT: %res10 = extractvalue { i32, i1 } [[TMP]], 0
   %res10 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") release monotonic
; CHECK-NEXT: %res11 = extractvalue { i32, i1 } [[TMP]], 0
-  %res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+  %res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") release monotonic
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") release monotonic
 ; CHECK-NEXT: %res12 = extractvalue { i32, i1 } [[TMP]], 0
-  %res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+  %res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") release monotonic
 
 
 ; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
@@ -283,13 +283,13 @@ entry:
 ; CHECK-NEXT: %res14 = extractvalue { i32, i1 } [[TMP]], 0
   %res14 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acq_rel acquire
 ; CHECK-NEXT: %res15 = extractvalue { i32, i1 } [[TMP]], 0
-  %res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+  %res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acq_rel acquire
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acq_rel acquire
 ; CHECK-NEXT: %res16 = extractvalue { i32, i1 } [[TMP]], 0
-  %res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+  %res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") acq_rel acquire
 
 
 ; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
@@ -300,13 +300,13 @@ entry:
 ; CHECK-NEXT: %res18 = extractvalue { i32, i1 } [[TMP]], 0
   %res18 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") seq_cst seq_cst
 ; CHECK-NEXT: %res19 = extractvalue { i32, i1 } [[TMP]], 0
-  %res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+  %res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") seq_cst seq_cst
 
-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") seq_cst seq_cst
 ; CHECK-NEXT: %res20 = extractvalue { i32, i1 } [[TMP]], 0
-  %res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+  %res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new syncscope("singlethread") seq_cst seq_cst
 
   ret void
 }
diff --git a/test/Bitcode/module-hash-strtab.ll b/test/Bitcode/module-hash-strtab.ll
new file mode 100644
index 0000000000000..e5a1fb0c40779
--- /dev/null
+++ b/test/Bitcode/module-hash-strtab.ll
@@ -0,0 +1,15 @@
+; RUN: opt -module-hash %s -o - | llvm-bcanalyzer -dump | grep '<HASH' > %t
+; RUN: opt -module-hash %S/Inputs/module-hash-strtab1.ll -o - | llvm-bcanalyzer -dump | grep '<HASH' >> %t
+; RUN: opt -module-hash %S/Inputs/module-hash-strtab2.ll -o - | llvm-bcanalyzer -dump | grep '<HASH' >> %t
+; RUN: sort %t | uniq | count 3
+
+source_filename = "foo.c"
+
+$com = comdat any
+
+define void @main() comdat($com) {
+  call void @foo()
+  ret void
+}
+
+declare void @foo()
diff --git a/test/Bitcode/module_hash.ll b/test/Bitcode/module_hash.ll
index 56f3fdc4b7eaa..b24819fe6fdec 100644
--- a/test/Bitcode/module_hash.ll
+++ b/test/Bitcode/module_hash.ll
@@ -1,7 +1,7 @@
 ; Check per module hash.
-; RUN: opt -module-hash %s -o - | llvm-bcanalyzer -dump | FileCheck %s --check-prefix=MOD1
+; RUN: opt -module-hash %s -o - | llvm-bcanalyzer -dump -check-hash=foo | FileCheck %s --check-prefix=MOD1
 ; MOD1: <HASH op0={{[0-9]*}} op1={{[0-9]*}} op2={{[0-9]*}} op3={{[0-9]*}} op4={{[0-9]*}} (match)/>
-; RUN: opt -module-hash %p/Inputs/module_hash.ll -o - | llvm-bcanalyzer -dump | FileCheck %s --check-prefix=MOD2
+; RUN: opt -module-hash %p/Inputs/module_hash.ll -o - | llvm-bcanalyzer -dump -check-hash=bar | FileCheck %s --check-prefix=MOD2
 ; MOD2: <HASH op0={{[0-9]*}} op1={{[0-9]*}} op2={{[0-9]*}} op3={{[0-9]*}} op4={{[0-9]*}} (match)/>
 
 ; Check that the hash matches in the combined index.
@@ -21,8 +21,8 @@
 ; RUN: cat %t.hash | FileCheck %s --check-prefix=COMBINED
 
 ; First capture the value of the hash for the two modules.
-; COMBINED: <HASH op0=[[HASH1_1:[0-9]*]] op1=[[HASH1_2:[0-9]*]] op2=[[HASH1_3:[0-9]*]] op3=[[HASH1_4:[0-9]*]] op4=[[HASH1_5:[0-9]*]] (match)/>
-; COMBINED: <HASH op0=[[HASH2_1:[0-9]*]] op1=[[HASH2_2:[0-9]*]] op2=[[HASH2_3:[0-9]*]] op3=[[HASH2_4:[0-9]*]] op4=[[HASH2_5:[0-9]*]] (match)/>
+; COMBINED: <HASH op0=[[HASH1_1:[0-9]*]] op1=[[HASH1_2:[0-9]*]] op2=[[HASH1_3:[0-9]*]] op3=[[HASH1_4:[0-9]*]] op4=[[HASH1_5:[0-9]*]]/>
+; COMBINED: <HASH op0=[[HASH2_1:[0-9]*]] op1=[[HASH2_2:[0-9]*]] op2=[[HASH2_3:[0-9]*]] op3=[[HASH2_4:[0-9]*]] op4=[[HASH2_5:[0-9]*]]/>
 
 ; Validate against the value extracted from the combined index
 ; COMBINED-DAG: <HASH abbrevid={{[0-9]*}} op0=[[HASH1_1]] op1=[[HASH1_2]] op2=[[HASH1_3]] op3=[[HASH1_4]] op4=[[HASH1_5]]/>
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
index a4d259add6093..86766f194688c 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
@@ -29,7 +29,7 @@
 ; CHECK-NEXT: <VERSION
 ; CHECK-NEXT: <VALUE_GUID op0=25 op1=123/>
 ; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
-; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=1 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=3/>
+; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=1 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=4/>
 ; CHECK-NEXT: </GLOBALVAL_SUMMARY_BLOCK>
 
 ; CHECK: <STRTAB_BLOCK
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
index b62090efe20b9..09a6bbcb51d5c 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll
@@ -29,7 +29,7 @@
 ; CHECK-NEXT: <VERSION
 ; CHECK-NEXT: <VALUE_GUID op0=25 op1=123/>
 ; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123
-; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=3 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=3/>
+; CHECK-NEXT: <PERMODULE_PROFILE {{.*}} op4=1 op5=3 op6=5 op7=1 op8=2 op9=3 op10=4 op11=3 op12=6 op13=2 op14=3 op15=3 op16=7 op17=2 op18=8 op19=2 op20=25 op21=4/>
 ; CHECK-NEXT: </GLOBALVAL_SUMMARY_BLOCK>
 
 ; CHECK: <STRTAB_BLOCK
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 50ad83feed859..10ce87c2a1873 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1328,16 +1328,16 @@ define void @test_load_store_atomics(i8* %addr) {
 ; CHECK: G_STORE [[V0]](s8), [[ADDR]](p0) :: (store monotonic 1 into %ir.addr)
 ; CHECK: [[V1:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load acquire 1 from %ir.addr)
 ; CHECK: G_STORE [[V1]](s8), [[ADDR]](p0) :: (store release 1 into %ir.addr)
-; CHECK: [[V2:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load singlethread seq_cst 1 from %ir.addr)
-; CHECK: G_STORE [[V2]](s8), [[ADDR]](p0) :: (store singlethread monotonic 1 into %ir.addr)
+; CHECK: [[V2:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load syncscope("singlethread") seq_cst 1 from %ir.addr)
+; CHECK: G_STORE [[V2]](s8), [[ADDR]](p0) :: (store syncscope("singlethread") monotonic 1 into %ir.addr)
 
   %v0 = load atomic i8, i8* %addr unordered, align 1
   store atomic i8 %v0, i8* %addr monotonic, align 1
 
   %v1 = load atomic i8, i8* %addr acquire, align 1
   store atomic i8 %v1, i8* %addr release, align 1
 
-  %v2 = load atomic i8, i8* %addr singlethread seq_cst, align 1
-  store atomic i8 %v2, i8* %addr singlethread monotonic, align 1
+  %v2 = load atomic i8, i8* %addr syncscope("singlethread") seq_cst, align 1
+  store atomic i8 %v2, i8* %addr syncscope("singlethread") monotonic, align 1
 
   ret void
 }
diff --git a/test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir b/test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir
new file mode 100644
index 0000000000000..8604b2769ba30
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-implicit-def.mir
@@ -0,0 +1,30 @@
+# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @implicit_def() { ret void }
+...
+
+---
+# CHECK-LABEL: name: implicit_def
+name: implicit_def
+legalized: true
+regBankSelected: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr32, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr32, preferred-register: '' }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK: body:
+# CHECK: [[DEF:%[0-9]+]] = IMPLICIT_DEF
+# CHECK: [[ADD:%[0-9]+]] = ADDWrr [[DEF]], [[DEF]]
+# CHECK: %w0 = COPY [[ADD]]
+body: |
+  bb.0:
+    %0(s32) = G_IMPLICIT_DEF
+    %1(s32) = G_ADD %0, %0
+    %w0 = COPY %1(s32)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir b/test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir
new file mode 100644
index 0000000000000..43e682c6b6ca5
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-intrinsic-aarch64-sdiv.mir
@@ -0,0 +1,38 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @sdiv_s32_gpr() { ret void }
+...
+
+---
+# Check that we select a 32-bit GPR sdiv intrinsic into SDIVWrr for GPR32.
+# Also check that we constrain the register class of the COPY to GPR32.
+# CHECK-LABEL: name: sdiv_s32_gpr
+name: sdiv_s32_gpr
+legalized: true
+regBankSelected: true
+
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr32, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr32, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: gpr32, preferred-register: '' }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK: body:
+# CHECK: %0 = COPY %w0
+# CHECK: %1 = COPY %w1
+# CHECK: %2 = SDIVWr %0, %1
+body: |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.sdiv.i32), %0, %1
+    %w0 = COPY %2(s32)
+...
diff --git a/test/CodeGen/AArch64/arm64-csldst-mmo.ll b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
index cfb8e3a38c492..37cc5411aa31b 100644
--- a/test/CodeGen/AArch64/arm64-csldst-mmo.ll
+++ b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
@@ -13,9 +13,9 @@
 ; CHECK: SU(2): STRWui %WZR
 ; CHECK: SU(3): %X21<def>, %X20<def> = LDPXi %SP
 ; CHECK: Predecessors:
-; CHECK-NEXT: out SU(0)
-; CHECK-NEXT: out SU(0)
-; CHECK-NEXT: ord SU(0)
+; CHECK-NEXT: SU(0): Out
+; CHECK-NEXT: SU(0): Out
+; CHECK-NEXT: SU(0): Ord
 ; CHECK-NEXT: Successors:
 define void @test1() {
 entry:
diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
index cde62fcb3f95c..ad4feef7280f2 100644
--- a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
@@ -8,8 +8,8 @@
 ; CHECK: shiftable
 ; CHECK: SU(2): %vreg2<def> = SUBXri %vreg1, 20, 0
 ; CHECK: Successors:
-; CHECK-NEXT: data SU(4): Latency=1 Reg=%vreg2
-; CHECK-NEXT: data SU(3): Latency=2 Reg=%vreg2
+; CHECK-NEXT: SU(4): Data Latency=1 Reg=%vreg2
+; CHECK-NEXT: SU(3): Data Latency=2 Reg=%vreg2
 ; CHECK: ********** INTERVALS **********
 define i64 @shiftable(i64 %A, i64 %B) {
   %tmp0 = sub i64 %B, 20
diff --git a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
index 748a4762d82f4..9cbf0cb3803a8 100644
--- a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
+++ b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
@@ -7,11 +7,11 @@
 ; CHECK: misched_bug:BB#0 entry
 ; CHECK: SU(2): %vreg2<def> = LDRWui %vreg0, 1; mem:LD4[%ptr1_plus1] GPR32:%vreg2 GPR64common:%vreg0
 ; CHECK: Successors:
-; CHECK-NEXT: data SU(5): Latency=4 Reg=%vreg2
-; CHECK-NEXT: ord SU(4): Latency=0
+; CHECK-NEXT: SU(5): Data Latency=4 Reg=%vreg2
+; CHECK-NEXT: SU(4): Ord Latency=0
 ; CHECK: SU(3): STRWui %WZR, %vreg0, 0; mem:ST4[%ptr1] GPR64common:%vreg0
 ; CHECK: Successors:
-; CHECK: ord SU(4): Latency=0
+; CHECK: SU(4): Ord Latency=0
 ; CHECK: SU(4): STRWui %WZR, %vreg1, 0; mem:ST4[%ptr2] GPR64common:%vreg1
 ; CHECK: SU(5): %W0<def> = COPY %vreg2; GPR32:%vreg2
 ; CHECK: ** ScheduleDAGMI::schedule picking next node
diff --git a/test/CodeGen/AArch64/fence-singlethread.ll b/test/CodeGen/AArch64/fence-singlethread.ll
index 2ed744277385a..0af0e58a91d45 100644
--- a/test/CodeGen/AArch64/fence-singlethread.ll
+++ b/test/CodeGen/AArch64/fence-singlethread.ll
@@ -16,6 +16,6 @@ define void @fence_singlethread() {
 ; IOS: ; COMPILER BARRIER
 ; IOS-NOT: dmb
 
-  fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
   ret void
 }
diff --git a/test/CodeGen/AArch64/preferred-function-alignment.ll b/test/CodeGen/AArch64/preferred-function-alignment.ll
new file mode 100644
index 0000000000000..88e6f5dd01c91
--- /dev/null
+++ b/test/CodeGen/AArch64/preferred-function-alignment.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=generic < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a35 < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a53 < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a73 < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cyclone < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=falkor < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=kryo < %s | FileCheck --check-prefix=ALIGN2 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderx < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderxt81 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderxt83 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderxt88 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 < %s | FileCheck --check-prefix=ALIGN3 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=cortex-a72 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m2 < %s | FileCheck --check-prefix=ALIGN4 %s
+; RUN: llc -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 < %s | FileCheck --check-prefix=ALIGN4 %s
+
+define void @test() {
+  ret void
+}
+
+; CHECK-LABEL: test
+; ALIGN2: .p2align 2
+; ALIGN3: .p2align 3
+; ALIGN4: .p2align 4
diff --git a/test/CodeGen/AArch64/tailcall_misched_graph.ll b/test/CodeGen/AArch64/tailcall_misched_graph.ll
index 4fbd8944f0322..7e76dac214a14 100644
--- a/test/CodeGen/AArch64/tailcall_misched_graph.ll
+++ b/test/CodeGen/AArch64/tailcall_misched_graph.ll
@@ -37,8 +37,8 @@ declare void @callee2(i8*, i8*, i8*, i8*, i8*,
 ; CHECK: SU({{.*}}): [[VRB]]<def> = LDRXui <fi#-2>
 ; CHECK-NOT: SU
 ; CHECK: Successors:
-; CHECK: ord SU([[DEPSTOREB:.*]]): Latency=0
-; CHECK: ord SU([[DEPSTOREA:.*]]): Latency=0
+; CHECK: SU([[DEPSTOREB:.*]]): Ord Latency=0
+; CHECK: SU([[DEPSTOREA:.*]]): Ord Latency=0
 
 ; CHECK: SU([[DEPSTOREA]]): STRXui %vreg{{.*}}, <fi#-4>
 ; CHECK: SU([[DEPSTOREB]]): STRXui %vreg{{.*}}, <fi#-3>
diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll
index bee13d8c17f1d..98848295a73b2 100644
--- a/test/CodeGen/AMDGPU/add.i16.ll
+++ b/test/CodeGen/AMDGPU/add.i16.ll
@@ -4,7 +4,7 @@
 ; GCN-LABEL: {{^}}v_test_add_i16:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: buffer_store_short [[ADD]]
 define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -67,7 +67,7 @@ define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i1
 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i32:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
 define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -86,7 +86,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1
 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
+; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -105,7 +105,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i1
 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
 define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@@ -125,7 +125,7 @@ define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i1
 ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
index 7e4546d2cfb3f..6dcd7c234dc6d 100644
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -5,9 +5,9 @@
 ;FUNC-LABEL: {{^}}test1:
 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
-;SI-NOT: [[REG]]
-;SI: buffer_store_dword [[REG]],
+;SI: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
+;SI: v_mov_b32_e32 v[[REG]], s[[REG]]
+;SI: buffer_store_dword v[[REG]],
 define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
@@ -21,8 +21,8 @@ define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
+;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 
 define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@@ -39,10 +39,10 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa
 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-;SI: v_add_i32_e32
v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll index 76f724c2b90ba..4baa35ca57c58 100644 --- a/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/test/CodeGen/AMDGPU/add.v2i16.ll @@ -168,10 +168,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace( ; VI: flat_load_ushort v[[B_HI:[0-9]+]] ; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] +; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; VI-NOT: and ; VI-NOT: shl -; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} diff --git a/test/CodeGen/AMDGPU/add_i128.ll b/test/CodeGen/AMDGPU/add_i128.ll index 00a125c2e44fb..d33965d4dda7a 100644 --- a/test/CodeGen/AMDGPU/add_i128.ll +++ b/test/CodeGen/AMDGPU/add_i128.ll @@ -19,10 +19,10 @@ define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 ; Check that the SGPR add operand is correctly moved to a VGPR. ; GCN-LABEL: {{^}}sgpr_operand: -; GCN: v_add_i32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %foo, %a @@ -31,10 +31,10 @@ define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 ad } ; GCN-LABEL: {{^}}sgpr_operand_reversed: -; GCN: v_add_i32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 -; GCN: v_addc_u32 +; GCN: s_add_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 +; GCN: s_addc_u32 define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %a, %foo diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll index 62733d5bfb6c9..f673d91192b84 100644 --- a/test/CodeGen/AMDGPU/add_i64.ll +++ b/test/CodeGen/AMDGPU/add_i64.ll @@ -19,8 +19,8 @@ define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 add ; Check that the SGPR add operand is correctly moved to a VGPR. ; SI-LABEL: {{^}}sgpr_operand: -; SI: v_add_i32 -; SI: v_addc_u32 +; SI: s_add_u32 +; SI: s_addc_u32 define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %foo, %a @@ -32,8 +32,8 @@ define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addr ; SGPR as other operand. 
; ; SI-LABEL: {{^}}sgpr_operand_reversed: -; SI: v_add_i32 -; SI: v_addc_u32 +; SI: s_add_u32 +; SI: s_addc_u32 define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %a, %foo diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll index b1e71722d80c5..a6aa9e7951515 100644 --- a/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/test/CodeGen/AMDGPU/addrspacecast.ll @@ -10,20 +10,22 @@ ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}} ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] +; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] +; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16) ; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16 ; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]] ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base - -; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] - -; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 -; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]] -; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1 +; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] @@ -48,6 +50,12 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}} ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]] +; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 +; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] + ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}} ; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16) ; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16 @@ -55,12 +63,11 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %pt ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base -; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] - -; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 -; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]] -; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] -; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 +; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0 +; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc +; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]] ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]] diff --git a/test/CodeGen/AMDGPU/alignbit-pat.ll b/test/CodeGen/AMDGPU/alignbit-pat.ll index ff5c8960fad36..3f07188063cde 100644 --- a/test/CodeGen/AMDGPU/alignbit-pat.ll +++ b/test/CodeGen/AMDGPU/alignbit-pat.ll @@ -1,4 +1,4 @@ -; RUN: llc 
-march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}alignbit_shr_pat: ; GCN-DAG: s_load_dword s[[SHR:[0-9]+]] diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll index 0e5605961e10c..0c7160df2b96d 100644 --- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -16,8 +16,8 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 ; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 -; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 +; CHECK: arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out @@ -110,15 +110,8 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 ; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1 ; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}} ; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}} - -; CHECK: extractelement <2 x float> %x -; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: extractelement <2 x float> %x -; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: store volatile <2 x float> %arcp.25ulp - -; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 +; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0 +; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x @@ -146,17 +139,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out ; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x ; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x ; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}} - -; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0 -; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0 -; CHECK: store volatile <2 x float> %arcp.25ulp - -; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0 -; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0 +; CHECK: %arcp.25ulp = fdiv arcp <2 x float> <float 
1.000000e+00, float 2.000000e+00>, %x, !fpmath !0 +; CHECK: %fast.25ulp = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x, !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x @@ -179,12 +163,10 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace ; FIXME: Should be able to get fdiv for 1.0 component ; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant( -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 ; CHECK: store volatile <2 x float> %arcp.25ulp -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 @@ -204,8 +186,8 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a ; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 ; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 +; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll index 2aec03aff8a3a..ef11ae87267eb 100644 --- a/test/CodeGen/AMDGPU/and-gcn.ll +++ b/test/CodeGen/AMDGPU/and-gcn.ll @@ -2,8 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_and_i64_br: -; SI: v_and_b32 -; SI: v_and_b32 +; SI: s_and_b64 define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll index c356f8b87cfc6..ee0190149e92e 100644 --- a/test/CodeGen/AMDGPU/and.ll +++ b/test/CodeGen/AMDGPU/and.ll @@ -8,8 +8,8 @@ declare i32 @llvm.r600.read.tidig.x() #0 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: AND_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 @@ -26,10 +26,11 @@ define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} +; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}} define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 @@ -136,7 +137,9 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrs ; FUNC-LABEL: {{^}}v_and_constant_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %a = load i32, i32 addrspace(1)* %gep, align 4 %and = and i32 %a, 1234567 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -145,7 +148,9 @@ define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrsp ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %a = load i32, i32 addrspace(1)* %gep, align 4 %and = and i32 %a, 64 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -154,7 +159,9 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 a ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %a = load i32, i32 addrspace(1)* %gep, align 4 %and = and i32 %a, -16 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -239,8 +246,11 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out ; SI: v_and_b32 ; SI: v_and_b32 define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 + %gep.b = getelementptr i64, i64 addrspace(1)* %bptr, i32 
%tid + %b = load i64, i64 addrspace(1)* %gep.b, align 8 %and = and i64 %a, %b store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -251,7 +261,9 @@ define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}} ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, 1231231234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -299,26 +311,30 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out } ; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant: -; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]] ; SI-NOT: and ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]] ; SI-NOT: and ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void } ; FUNC-LABEL: {{^}}v_and_inline_imm_i64: -; SI: buffer_load_dword v{{[0-9]+}} +; SI: {{buffer|flat}}_load_dword v{{[0-9]+}} ; SI-NOT: and ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} ; SI-NOT: and ; SI: buffer_store_dwordx2 define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, 64 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -326,13 +342,15 @@ define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addr ; FIXME: Should be able to reduce load width ; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64: -; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; SI-NOT: and ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]] ; SI-NOT: and ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.a, align 8 %and = and i64 %a, -8 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -549,5 +567,4 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1 store i64 %and, i64 addrspace(1)* %out, align 8 ret void } - attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll index c61c23222bc7e..cdc60ab504e01 100644 --- a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -2,9 +2,9 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: 
{{^}}any_extend_vector_inreg_v16i8_to_v4i32: -; GCN: {{buffer|flat}}_load_dwordx4 -; GCN-DAG: {{buffer|flat}}_load_dwordx4 -; GCN-DAG: {{buffer|flat}}_load_dword +; GCN: s_load_dwordx4 +; GCN-DAG: s_load_dwordx4 +; GCN-DAG: s_load_dword ; GCN: {{buffer|flat}}_store_byte ; GCN: {{buffer|flat}}_store_byte diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll index 539373f7bdeb4..f29bfb46b94bd 100644 --- a/test/CodeGen/AMDGPU/bitreverse.ll +++ b/test/CodeGen/AMDGPU/bitreverse.ll @@ -2,6 +2,8 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +declare i32 @llvm.amdgcn.workitem.id.x() #1 + declare i16 @llvm.bitreverse.i16(i16) #1 declare i32 @llvm.bitreverse.i32(i32) #1 declare i64 @llvm.bitreverse.i64(i64) #1 @@ -42,12 +44,14 @@ define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) } ; FUNC-LABEL: {{^}}v_brev_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]], ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { - %val = load i32, i32 addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid + %val = load i32, i32 addrspace(1)* %gep %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out ret void @@ -66,7 +70,9 @@ define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; SI: v_bfrev_b32_e32 ; SI: v_bfrev_b32_e32 define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out ret void @@ -82,7 +88,9 @@ define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) ; FUNC-LABEL: {{^}}v_brev_i64: ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { - %val = load i64, i64 addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid + %val = load i64, i64 addrspace(1)* %gep %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out ret void @@ -97,7 +105,9 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; FUNC-LABEL: {{^}}v_brev_v2i64: define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid + %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out ret void diff --git 
a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll index d2dacd7c17b3f..eb3fc2fab34fd 100644 --- a/test/CodeGen/AMDGPU/bswap.ll +++ b/test/CodeGen/AMDGPU/bswap.ll @@ -10,7 +10,7 @@ declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone ; FUNC-LABEL: @test_bswap_i32 -; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI: s_load_dword [[VAL:s[0-9]+]] ; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 ; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 5dec3e35ab3d0..c114332a58872 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-CIVI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-CIVI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; OPT-LABEL: @test_no_sink_flat_small_offset_i32( ; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in @@ -40,7 +40,7 @@ done: ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT: getelementptr i32, i32 addrspace(4)* %out, -; OPT-CI-NOT: getelementptr +; rOPT-CI-NOT: getelementptr ; OPT: br i1 ; OPT-CI: addrspacecast diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index c1cf56e5058ec..c01d834bc33d6 100644 --- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn 
-mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" diff --git a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir index 6ecf75c1acec3..90fba03420901 100644 --- a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir +++ b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir @@ -1,36 +1,4 @@ # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s ---- | - define amdgpu_ps void @v_max_self_clamp_not_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_clamp_omod_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_omod_mul_omod_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_omod_mul_clamp_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_omod_add_omod_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_omod_add_clamp_already_set_f32() #0 { - ret void - } - - define amdgpu_ps void @v_max_reg_imm_f32() #0 { - ret void - } - - attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" } - -... 
--- # GCN-LABEL: name: v_max_self_clamp_not_set_f32 # GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec @@ -70,7 +38,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -132,7 +100,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -195,7 +163,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -260,7 +228,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -337,7 +305,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -402,7 +370,7 @@ liveins: - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '%vgpr0', virtual-reg: '%3' } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 @@ -435,7 +403,7 @@ registers: - { id: 0, class: vgpr_32 } - { id: 1, class: vgpr_32 } body: | - bb.0 (%ir-block.0): + bb.0: liveins: %vgpr0 %0 = COPY %vgpr0 diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll index 3e1b76a1df094..14b798ba822b7 100644 --- a/test/CodeGen/AMDGPU/coalescer_remat.ll +++ b/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float) ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; It's probably OK if this is slightly higher: -; CHECK: ; NumVgprs: 8 +; CHECK: ; NumVgprs: 4 define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index ed78ccc9b617c..0401f7b07e218 100644 --- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -1,84 +1,5 @@ # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s ---- | - define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %and = and i32 %a, 1234567 - store volatile i32 %and, i32 addrspace(1)* %out - ret void - } - - define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %idxprom = sext i32 %tid to i64 - %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom - %a = load i32, i32 addrspace(1)* %gep.a - %and = and i32 %a, 1234567 - store i32 %and, i32 addrspace(1)* %gep.out - ret void - } - - define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %shl = shl i32 %a, 12 - store volatile i32 %shl, i32 addrspace(1)* %out - ret void - } - - define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %idxprom = sext i32 
%tid to i64 - %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom - %a = load i32, i32 addrspace(1)* %gep.a - %shl = shl i32 %a, 12 - store i32 %shl, i32 addrspace(1)* %gep.out - ret void - } - - define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %ashr = ashr i32 %a, 12 - store volatile i32 %ashr, i32 addrspace(1)* %out - ret void - } - - define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %idxprom = sext i32 %tid to i64 - %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom - %a = load i32, i32 addrspace(1)* %gep.a - %ashr = ashr i32 %a, 12 - store i32 %ashr, i32 addrspace(1)* %gep.out - ret void - } - - define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { - %lshr = lshr i32 %a, 12 - store volatile i32 %lshr, i32 addrspace(1)* %out - ret void - } - - define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %idxprom = sext i32 %tid to i64 - %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom - %a = load i32, i32 addrspace(1)* %gep.a - %lshr = lshr i32 %a, 12 - store i32 %lshr, i32 addrspace(1)* %gep.out - ret void - } - - define amdgpu_kernel void @undefined_vreg_operand() { - unreachable - } - - declare i32 @llvm.amdgcn.workitem.id.x() #1 - - attributes #0 = { nounwind } - attributes #1 = { nounwind readnone } - ... ---- # GCN-LABEL: name: s_fold_and_imm_regimm_32{{$}} # GCN: %10 = V_MOV_B32_e32 1543, implicit %exec @@ -119,11 +40,11 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1 %0 = COPY %sgpr0_sgpr1 - %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %1 = S_LOAD_DWORDX2_IMM %0, 36, 0 %2 = COPY %1.sub1 %3 = COPY %1.sub0 %4 = S_MOV_B32 61440 @@ -133,7 +54,7 @@ body: | %8 = S_MOV_B32 9999 %9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc %10 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... 
@@ -204,12 +125,12 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %3 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 %31 = V_ASHRREV_I32_e64 31, %3, implicit %exec %32 = REG_SEQUENCE %3, 1, %31, 2 %33 = V_LSHLREV_B64 2, killed %32, implicit %exec @@ -223,19 +144,19 @@ body: | %34 = V_MOV_B32_e32 63, implicit %exec %27 = V_AND_B32_e64 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr %28 = V_AND_B32_e64 %24, %26, implicit %exec - FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr %29 = V_AND_B32_e32 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr %30 = V_AND_B32_e64 %26, %26, implicit %exec - FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr %31 = V_AND_B32_e64 %34, %34, implicit %exec - FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM @@ -285,11 +206,11 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 %5 = S_MOV_B32 1 %6 = COPY %4.sub1 %7 = COPY %4.sub0 @@ -298,7 +219,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_LSHL_B32 killed %5, 12, implicit-def dead %scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... 
@@ -390,7 +311,7 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %2 = COPY %vgpr0 @@ -411,34 +332,34 @@ body: | %27 = S_MOV_B32 -4 %11 = V_LSHLREV_B32_e64 12, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr %12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr %13 = V_LSHL_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr %14 = V_LSHL_B32_e64 12, %7, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr %15 = V_LSHL_B32_e64 12, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr %22 = V_LSHL_B32_e64 %6, 12, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr %23 = V_LSHL_B32_e64 %6, 32, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr %25 = V_LSHL_B32_e32 %6, %6, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr %26 = V_LSHLREV_B32_e32 11, %24, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr %28 = V_LSHL_B32_e32 %27, %6, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM @@ -485,11 +406,11 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 %5 = S_MOV_B32 999123 %6 = COPY %4.sub1 %7 = COPY %4.sub0 @@ -498,7 +419,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_ASHR_I32 killed %5, 12, implicit-def dead %scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec S_ENDPGM ... 
@@ -593,12 +514,12 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1, %vgpr0 %2 = COPY %vgpr0 %0 = COPY %sgpr0_sgpr1 - %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec %16 = REG_SEQUENCE %2, 1, %15, 2 %17 = V_LSHLREV_B64 2, killed %16, implicit %exec @@ -619,34 +540,34 @@ body: | %35 = V_MOV_B32_e32 2, implicit %exec %11 = V_ASHRREV_I32_e64 8, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr %12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr %13 = V_ASHR_I32_e64 %7, 3, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr %14 = V_ASHR_I32_e64 7, %32, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr %15 = V_ASHR_I32_e64 %27, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr %22 = V_ASHR_I32_e64 %6, 4, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr %23 = V_ASHR_I32_e64 %6, %33, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr %25 = V_ASHR_I32_e32 %34, %34, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr %26 = V_ASHRREV_I32_e32 11, %10, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr %28 = V_ASHR_I32_e32 %27, %35, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM @@ -693,11 +614,11 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | - bb.0 (%ir-block.0): + bb.0: liveins: %sgpr0_sgpr1 %0 = COPY %sgpr0_sgpr1 - %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %4 = S_LOAD_DWORDX2_IMM %0, 36, 0 %5 = S_MOV_B32 -999123 %6 = COPY %4.sub1 %7 = COPY %4.sub0 @@ -706,7 +627,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_LSHR_B32 killed %5, 12, implicit-def dead %scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET 
killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec
    S_ENDPGM

...
@@ -802,12 +723,12 @@ frameInfo:
  hasVAStart: false
  hasMustTailInVarArgFunc: false
body: |
-  bb.0 (%ir-block.0):
+  bb.0:
    liveins: %sgpr0_sgpr1, %vgpr0

    %2 = COPY %vgpr0
    %0 = COPY %sgpr0_sgpr1
-    %3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %3 = S_LOAD_DWORDX2_IMM %0, 36, 0
    %15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
    %16 = REG_SEQUENCE %2, 1, %15, 2
    %17 = V_LSHLREV_B64 2, killed %16, implicit %exec
@@ -828,34 +749,34 @@ body: |
    %35 = V_MOV_B32_e32 2, implicit %exec

    %11 = V_LSHRREV_B32_e64 8, %10, implicit %exec
-    FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr
    %12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec
-    FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr
    %13 = V_LSHR_B32_e64 %7, 3, implicit %exec
-    FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr
    %14 = V_LSHR_B32_e64 7, %32, implicit %exec
-    FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr
    %15 = V_LSHR_B32_e64 %27, %24, implicit %exec
-    FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr
    %22 = V_LSHR_B32_e64 %6, 4, implicit %exec
-    FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr
    %23 = V_LSHR_B32_e64 %6, %33, implicit %exec
-    FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr
    %25 = V_LSHR_B32_e32 %34, %34, implicit %exec
-    FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr
    %26 = V_LSHRREV_B32_e32 11, %10, implicit %exec
-    FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr
    %28 = V_LSHR_B32_e32 %27, %35, implicit %exec
-    FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
+    FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr
    S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 8611cd080e15d..09d4b2c8bd774 100644
--- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -107,7 +107,7 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]]
-; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
+; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]], v[[VREG1_LO]]
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
index d772d1b679369..e39bd60a1cc88 100644
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -5,35 +5,41 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

; FUNC-LABEL: {{^}}test_copy_v4i8:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
 ret void
}

; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
 ret void
}

; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -41,14 +47,16 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
}

; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
-; GCN: buffer_load_dword [[REG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
@@ -57,7 +65,7 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
}

; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN-DAG: v_lshrrev_b32
; GCN: v_and_b32
; GCN: v_or_b32
@@ -66,7 +74,9 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
- %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
 %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
@@ -97,19 +107,21 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
}

; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
- %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
+ %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
 ret void
}

; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
-; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
@@ -120,9 +132,9 @@ define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3
}

; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte

; GCN: buffer_store_byte
; GCN: buffer_store_byte
@@ -135,10 +147,10 @@ define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3
}

; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
; GCN: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
@@ -148,10 +160,10 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %
}

; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
+; GCN: {{buffer|flat}}_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
index 149c50685b1db..a544cbe890b50 100644
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s

declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
@@ -34,9 +34,9 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
}

; FUNC-LABEL: {{^}}v_ctlz_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
-; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
+; GCN: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
+; GCN: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
@@ -44,14 +46,16 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
; EG: FFBH_UINT
; EG: CNDE_INT
define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
 store i32 %ctlz, i32 addrspace(1)* %out, align 4
 ret void
}

; FUNC-LABEL: {{^}}v_ctlz_v2i32:
-; GCN: buffer_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: buffer_store_dwordx2
@@ -62,14 +64,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
; EG: FFBH_UINT
; EG: CNDE_INT
define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
 store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
 ret void
}

; FUNC-LABEL: {{^}}v_ctlz_v4i32:
-; GCN: buffer_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
@@ -90,16 +94,25 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; EG-DAG: FFBH_UINT
; EG-DAG: CNDE_INT
define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
 store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
 ret void
}

; FUNC-LABEL: {{^}}v_ctlz_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
-; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
+; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
+; VI-DAG: v_ffbh_u32_sdwa [[FFBH:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[VAL]]
+; VI: v_cmp_ne_u16_e32 vcc, 0, [[VAL]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 32, [[FFBH]], vcc
+
+; SI: v_subrev_i32_e32 [[RESULT:v[0-9]+]], vcc, 24, [[SELECT]]
+; VI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, -16, [[SELECT]]
; GCN: buffer_store_byte [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
@@ -136,12 +149,12 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64

; FUNC-LABEL: {{^}}v_ctlz_i64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]]
-; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
+; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], vcc
+; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[LO]], v[[HI]]
; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}}
@@ -168,12 +181,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
}

; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
- define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
 %cmp = icmp eq i32 %val, 0
 %sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -182,12 +197,14 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
}

; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
 %cmp = icmp ne i32 %val, 0
 %sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -197,13 +214,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out

; TODO: Should be able to eliminate select here as well.
; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
 %cmp = icmp eq i32 %ctlz, 32
 %sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -212,13 +231,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
}

; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
 %cmp = icmp ne i32 %ctlz, 32
 %sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -242,7 +263,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
}

; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
-; SI: buffer_load_ushort [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; SI: buffer_store_short [[FFBH]],
define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 48f3e4401f1a8..7500da536307f 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -29,21 +29,23 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out,
}

; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
 store i32 %ctlz, i32 addrspace(1)* %out, align 4
 ret void
}

; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32:
-; GCN: buffer_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: buffer_store_dwordx2
@@ -52,14 +54,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out,
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
 store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
 ret void
}

; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32:
-; GCN: buffer_load_dwordx4
+; GCN: {{buffer|flat}}_load_dwordx4
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
; GCN: v_ffbh_u32_e32
@@ -72,18 +76,22 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
 store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
 ret void
}

; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
-; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_byte [[RESULT]],
define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
- %val = load i8, i8 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
+ %val = load i8, i8 addrspace(1)* %in.gep
 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
 store i8 %ctlz, i8 addrspace(1)* %out
 ret void
@@ -116,11 +124,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias

; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
+; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, v[[HI]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
+; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}}
define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
 %tid = call i32 @llvm.r600.read.tidig.x()
@@ -145,11 +153,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
}

; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
 %cmp = icmp eq i32 %val, 0
 %sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -158,11 +168,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
}

; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
 %cmp = icmp ne i32 %val, 0
 %sel = select i1 %cmp, i32 %ctlz, i32 -1
@@ -186,15 +198,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa
}

; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[VAL]]
; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc
; GCN-DAG: buffer_store_dword [[RESULT0]]
; GCN-DAG: buffer_store_byte [[RESULT1]]
; GCN: s_endpgm
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
 %cmp = icmp eq i32 %val, 0
 %sel = select i1 %cmp, i32 -1, i32 %ctlz
@@ -205,13 +219,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa

; Selected on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
 %cmp = icmp eq i32 %val, 0
 %sel = select i1 %cmp, i32 0, i32 %ctlz
@@ -221,13 +237,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noa

; Selected on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
 %cmp = icmp ne i32 %val, 0
 %sel = select i1 %cmp, i32 %ctlz, i32 0
@@ -237,13 +255,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal

; Compare on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
- define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
 %cmp = icmp eq i32 %val, 1
 %sel = select i1 %cmp, i32 0, i32 %ctlz
@@ -253,13 +273,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noal

; Selected on wrong constant
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
-; GCN: buffer_load_dword
+; GCN: {{buffer|flat}}_load_dword
; GCN: v_ffbh_u32_e32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep
 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
 %cmp = icmp ne i32 %val, 1
 %sel = select i1 %cmp, i32 %ctlz, i32 0
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index aa913ad406d2b..68b39bad2bc12 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -8,6 +8,8 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone

+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
; FUNC-LABEL: {{^}}s_ctpop_i32:
; GCN: s_load_dword [[SVAL:s[0-9]+]],
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
@@ -24,22 +26,24 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val)

; XXX - Why 0 in register?
; FUNC-LABEL: {{^}}v_ctpop_i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
 store i32 %ctpop, i32 addrspace(1)* %out, align 4
 ret void
}

; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
-; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
@@ -49,8 +53,11 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
- %val0 = load i32, i32 addrspace(1)* %in0, align 4
- %val1 = load i32, i32 addrspace(1)* %in1, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
+ %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
+ %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
+ %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
 %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
 %add = add i32 %ctpop0, %ctpop1
@@ -59,15 +66,17 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out,
}

; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
-; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
; GCN: s_waitcnt
; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
- %val0 = load i32, i32 addrspace(1)* %in0, align 4
- %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
- %add = add i32 %ctpop0, %sval
+define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %ctpop, %sval
 store i32 %add, i32 addrspace(1)* %out, align 4
 ret void
}
@@ -80,7 +89,9 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out,
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
 %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
 store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
 ret void
@@ -98,7 +109,9 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
 %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
 store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
 ret void
@@ -124,7 +137,9 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32
 %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
 store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
 ret void
@@ -166,21 +181,25 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
- %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid
+ %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32
 %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
 store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
 ret void
}

; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
 %add = add i32 %ctpop, 4
 store i32 %add, i32 addrspace(1)* %out, align 4
@@ -188,14 +207,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa
}

; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
 %add = add i32 4, %ctpop
 store i32 %add, i32 addrspace(1)* %out, align 4
@@ -203,14 +224,16 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)*
}

; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
 %add = add i32 %ctpop, 99999
 store i32 %add, i32 addrspace(1)* %out, align 4
@@ -218,7 +241,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou
}

; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
@@ -226,7 +249,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
 %add = add i32 %ctpop, %const
 store i32 %add, i32 addrspace(1)* %out, align 4
@@ -234,7 +259,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i
}

; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
@@ -242,7 +267,9 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
 %add = add i32 %const, %ctpop
 store i32 %add, i32 addrspace(1)* %out, align 4
@@ -250,18 +277,22 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou
}

; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}}
-; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
+; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
+; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
- %val = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
- %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
+ %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid
 %const = load i32, i32 addrspace(1)* %gep, align 4
 %add = add i32 %const, %ctpop
 store i32 %add, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
index f18bd9fd8174b..4850370851f63 100644
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,6 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s

+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
@@ -25,14 +27,16 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val)
}

; FUNC-LABEL: {{^}}v_ctpop_i64:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
- %val = load i64, i64 addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep, align 8
 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
 %truncctpop = trunc i64 %ctpop to i32
 store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -40,7 +44,7 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
}

; FUNC-LABEL: {{^}}v_ctpop_i64_user:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
@@ -49,7 +53,9 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
- %val = load i64, i64 addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
+ %val = load i64, i64 addrspace(1)* %in.gep, align 8
 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
 %or = or i64 %ctpop, %s.val
 store i64 %or, i64 addrspace(1)* %out
@@ -87,7 +93,9 @@ define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
- %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i32 %tid
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in.gep, align 16
 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
 %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
 store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -105,7 +113,9 @@ define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
- %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep, align 32
 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
 %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
 store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -169,7 +179,8 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)

; FIXME: Should not have extra add
; FUNC-LABEL: {{^}}v_ctpop_i128:
-; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; VI: flat_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}

; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
@@ -182,7 +193,9 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
- %val = load i128, i128 addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %tid
+ %val = load i128, i128 addrspace(1)* %in.gep, align 8
 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
 %truncctpop = trunc i128 %ctpop to i32
 store i32 %truncctpop, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 1fa6407647eb8..1bfd38d94bfdf 100644
--- a/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -5,6 +5,7 @@ declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone

; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
; SI: s_load_dword [[VAL:s[0-9]+]],
@@ -21,21 +22,23 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out,
}

; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
- %val = load i32, i32 addrspace(1)* %valptr, align 4
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
+ %val = load i32, i32 addrspace(1)* %in.gep, align 4
 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
 store i32 %cttz, i32 addrspace(1)* %out, align 4
 ret void
}

; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
-; SI: buffer_load_dwordx2
+; SI: {{buffer|flat}}_load_dwordx2
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: buffer_store_dwordx2
@@ -44,14 +47,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out,
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
 store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
 ret void
}

; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
-; SI: buffer_load_dwordx4
+; SI: {{buffer|flat}}_load_dwordx4
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
@@ -64,7 +69,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noali
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
- %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
+ %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
 store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
 ret void
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 0328ce31002df..f839129fc3d87 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -5,46 +5,52 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

; GCN-LABEL: {{^}}load_i8_to_f32:
-; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]],
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dword [[CONV]],
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
- %load = load i8, i8 addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
+ %load = load i8, i8 addrspace(1)* %gep, align 1
 %cvt = uitofp i8 %load to float
 store float %cvt, float addrspace(1)* %out, align 4
 ret void
}

; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
-; GCN: buffer_load_ushort [[LD:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
 %cvt = uitofp <2 x i8> %load to <2 x float>
 store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
 ret void
}

; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: v_cvt_f32_ubyte3_e32
; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
 %cvt = uitofp <3 x i8> %load to <3 x float>
 store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
 ret void
}

; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
-; GCN: buffer_load_dword [[LOADREG:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]]
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
@@ -53,7 +59,9 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
 %cvt = uitofp <4 x i8> %load to <4 x float>
 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
 ret void
@@ -64,10 +72,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias

; FIXME: Packing bytes
; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
-; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
-; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]]
; GCN-DAG: v_lshlrev_b32
; GCN-DAG: v_or_b32
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
@@ -77,7 +85,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
 %cvt = uitofp <4 x i8> %load to <4 x float>
 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
 ret void
@@ -124,14 +134,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n

; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
; GCN: s_endpgm
define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
 %cvt = uitofp <7 x i8> %load to <7 x float>
 store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
 ret void
}

; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
; GCN-NOT: bfe
; GCN-NOT: lshr
; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
@@ -147,19 +159,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
 %cvt = uitofp <8 x i8> %load to <8 x float>
 store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
 ret void
}

; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
-; GCN: buffer_load_dword [[LOADREG:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
; GCN: buffer_store_dword [[CONV]],
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep, align 4
 %add = add i32 %load, 2
 %inreg = and i32 %add, 255
 %cvt = uitofp i32 %inreg to float
@@ -169,7 +185,9 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias

; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %load = load i32, i32 addrspace(1)* %in, align 4
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %load = load i32, i32 addrspace(1)* %gep, align 4
 %inreg = and i32 %load, 65280
 %shr = lshr i32 %inreg, 8
 %cvt = uitofp i32 %shr to float
@@ -181,7 +199,9 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
; them so it shouldn't really matter.
; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
- %load = load i8, i8 addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
+ %load = load i8, i8 addrspace(1)* %gep, align 1
 %ext = zext i8 %load to i32
 %cvt = uitofp i32 %ext to float
 store float %cvt, float addrspace(1)* %out, align 4
@@ -190,7 +210,9 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out,

; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
- %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
 %ext = zext <4 x i8> %load to <4 x i32>
 %cvt = uitofp <4 x i32> %ext to <4 x float>
 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -198,12 +220,14 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
}

; GCN-LABEL: {{^}}extract_byte0_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
 %and = and i32 %val, 255
 %cvt = uitofp i32 %and to float
 store float %cvt, float addrspace(1)* %out
@@ -211,12 +235,14 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
}

; GCN-LABEL: {{^}}extract_byte1_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
 %srl = lshr i32 %val, 8
 %and = and i32 %srl, 255
 %cvt = uitofp i32 %and to float
@@ -225,12 +251,14 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
}

; GCN-LABEL: {{^}}extract_byte2_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
 %srl = lshr i32 %val, 16
 %and = and i32 %srl, 255
 %cvt = uitofp i32 %and to float
@@ -239,12 +267,14 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
}

; GCN-LABEL: {{^}}extract_byte3_to_f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
- %val = load i32, i32 addrspace(1)* %in
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+ %val = load i32, i32 addrspace(1)* %gep
 %srl = lshr i32 %val, 24
 %and = and i32 %srl, 255
 %cvt = uitofp i32 %and to float
diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
index 3148b9b8ff9db..c265b8e2ad2ea 100644
--- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir
+++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -1,14 +1,4 @@
# RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s
---- |
-  define amdgpu_kernel void @test0() { ret void }
-  define amdgpu_kernel void @test1() { ret void }
-  define amdgpu_kernel void @test2() { ret void }
-  define amdgpu_kernel void @test3() { ret void }
-  define amdgpu_kernel void @test4() { ret void }
-  define amdgpu_kernel void @test5() { ret void }
-  define amdgpu_kernel void @loop0() { ret void }
-  define amdgpu_kernel void @loop1() { ret void }
-  define amdgpu_kernel void @loop2() { ret void }
...
---
# Combined use/def transfer check, the basics.
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll index 2c474dbe7b086..deb90df99dcf4 100644 --- a/test/CodeGen/AMDGPU/ds_read2.ll +++ b/test/CodeGen/AMDGPU/ds_read2.ll @@ -9,7 +9,7 @@ ; SI-LABEL: @simple_read2_f32 ; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { @@ -28,7 +28,7 @@ define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { ; SI-LABEL: @simple_read2_f32_max_offset ; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll index 3dfdaf3936a64..ef4efc6336ce1 100644 --- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -38,9 +38,9 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} -; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]] -; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]] -; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]] +; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]] +; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[REG_W]] +; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD0]], v[[ADD1]] ; CI: buffer_store_dword v[[ADD2]] ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { @@ -64,8 +64,8 @@ define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1) ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4: ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}} -; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]] -; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]] +; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_X]], v[[REG_Z]] +; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[ADD0]], v[[REG_Y]] ; CI: buffer_store_dword v[[ADD1]] ; CI: s_endpgm define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll index 81b35a46aa188..b1fba8c240d7c 100644 --- a/test/CodeGen/AMDGPU/ds_read2st64.ll +++ b/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -7,7 +7,7 @@ ; SI-LABEL: @simple_read2st64_f32_0_1 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword 
[[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { @@ -26,7 +26,7 @@ define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 ; SI-LABEL: @simple_read2st64_f32_1_2 ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { @@ -46,7 +46,7 @@ define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, fl ; SI-LABEL: @simple_read2st64_f32_max_offset ; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 ; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll index ace01593808b7..74404989f8c71 100644 --- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -1,4 +1,4 @@ -; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; FIXME: Most of these cases that don't trigger because of broken cost ; heuristics. 
Should not need -stress-early-ifcvt diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll index 9439130deb9ef..792f0b1eaef46 100644 --- a/test/CodeGen/AMDGPU/early-if-convert.ll +++ b/test/CodeGen/AMDGPU/early-if-convert.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; FIXME: This leaves behind a now unnecessary and with exec diff --git a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll index 6eb1fc1d0cc29..b7dfcd99029a0 100644 --- a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll +++ b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll @@ -2,16 +2,21 @@ ; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s ; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + ; Test that the -enable-no-signed-zeros-fp-math flag works ; GCN-LABEL: {{^}}fneg_fsub_f32: -; GCN: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GCN: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] ; GCN-UNSAFE-NOT: xor define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 - %a = load float, float addrspace(1)* %in, align 4 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %tid, 1 + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add + %a = load float, float addrspace(1)* %gep, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 %result = fsub float %a, %b %neg.result = fsub float -0.0, %result diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index 34999fa3aea43..3fb452de1ccf4 100644 --- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -1,5 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + ; Make sure the add and load are reduced to 32-bits even with the ; bitcast to vector. 
; GCN-LABEL: {{^}}bitcast_int_to_vector_extract_0: @@ -8,7 +10,9 @@ ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]] ; GCN: buffer_store_dword [[ADD]] define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { - %a = load i64, i64 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %a = load i64, i64 addrspace(1)* %gep %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x i32> %extract = extractelement <2 x i32> %val.bc, i32 0 @@ -21,7 +25,9 @@ define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %ou ; GCN: v_add_f64 ; GCN: buffer_store_dword v define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) { - %a = load double, double addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %a = load double, double addrspace(1)* %gep %add = fadd double %a, %b %val.bc = bitcast double %add to <2 x i32> %extract = extractelement <2 x i32> %val.bc, i32 0 @@ -34,7 +40,9 @@ define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out ; GCN: v_add_i32 ; GCN: buffer_store_dword define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { - %a = load i64, i64 addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %a = load i64, i64 addrspace(1)* %gep %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x float> %extract = extractelement <2 x float> %val.bc, i32 0 diff --git a/test/CodeGen/AMDGPU/fabs.f16.ll b/test/CodeGen/AMDGPU/fabs.f16.ll index 4e2ec4b3054fe..d56d5ec1411a9 100644 --- a/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fabs.f16.ll @@ -39,9 +39,9 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) { ; VI: flat_load_ushort [[HI:v[0-9]+]] ; VI: flat_load_ushort [[LO:v[0-9]+]] ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]] +; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[HI]], [[MASK]] ; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]] +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]] ; VI: flat_store_dword ; GFX9: s_load_dword [[VAL:s[0-9]+]] @@ -62,8 +62,8 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] +; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -80,7 +80,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]] ; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], 
|[[IN1]]| -; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]] +; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[ABS_CVT1]], [[CVT0]] ; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]] @@ -134,7 +134,9 @@ define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %i ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]] ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}} define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %fmul = fmul <2 x half> %fabs, %val store <2 x half> %fmul, <2 x half> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll index 9edf55cbc69fe..0c4a77964d154 100644 --- a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -16,8 +16,8 @@ ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[V]], [[U]] -; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[Y]], [[X]] +; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]] +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]] ; GCN-FLUSH-NEXT: buffer_store_dword [[Z]] ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]] @@ -49,7 +49,7 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 { ; GCN: buffer_load_dword [[V:v[0-9]+]] ; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]]], [[U]], [[V]], -[[Z]] -; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[Y]], [[X]] +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]] ; GCN-FLUSH-NEXT: buffer_store_dword [[Z]] ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]] @@ -75,13 +75,13 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 { ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] -; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]] -; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]] +; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] +; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] +; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]] -; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]] -; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]] +; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]] ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_mul_f32_e32 @@ -108,13 +108,13 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 { ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] -; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[Y]], [[X]] -; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]] +; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] +; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] +; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]] -; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], 
[[MUL]] -; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]] +; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]] ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_mul_f32_e32 @@ -191,17 +191,17 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] -; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[Y]], [[X]], [[MUL]] -; GCN-FLUSH: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]] +; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]] +; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]] ; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]] -; GCN-FASTFMA: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MAD]] +; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]] -; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]] +; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] ; GCN-SLOWFMA: v_add_f32_e32 -; GCN-SLOWFMA: v_subrev_f32_e32 [[MAD:v[0-9]+]] +; GCN-SLOWFMA: v_sub_f32_e32 [[MAD:v[0-9]+]] ; GCN: buffer_store_dword [[MUL]] ; GCN: buffer_store_dword [[MAD]] @@ -226,21 +226,21 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { ; GCN: buffer_load_dword [[U:v[0-9]+]] ; GCN: buffer_load_dword [[V:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[V]], [[U]] +; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]] -; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[Y]], [[X]] -; GCN-FLUSH-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]] +; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]] +; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]] ; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]] ; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]] ; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]] -; GCN-FASTFMA-NEXT: v_subrev_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]] +; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]] ; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]] ; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]] -; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]] +; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]] ; GCN-SLOWFMA: v_add_f32_e32 -; GCN-SLOWFMA: v_subrev_f32_e32 +; GCN-SLOWFMA: v_sub_f32_e32 define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll index 08199be144f49..88b3be0e0d31c 100644 --- a/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/test/CodeGen/AMDGPU/fadd.f16.ll @@ -2,13 +2,13 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}fadd_f16 -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short 
v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fadd_f16( @@ -24,7 +24,7 @@ entry: } ; GCN-LABEL: {{^}}fadd_f16_imm_a -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] @@ -42,7 +42,7 @@ entry: } ; GCN-LABEL: {{^}}fadd_f16_imm_b -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] @@ -60,8 +60,8 @@ entry: } ; GCN-LABEL: {{^}}fadd_v2f16: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] +; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] @@ -70,16 +70,16 @@ entry: ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -88,15 +88,18 @@ define amdgpu_kernel void @fadd_v2f16( <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { entry: - %a.val = load <2 x half>, <2 x half> addrspace(1)* %a - %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid + %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid + %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b %r.val = fadd <2 x half> %a.val, %b.val store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fadd_v2f16_imm_a: -; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] @@ -105,12 +108,12 @@ entry: ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, 
v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -118,14 +121,16 @@ define amdgpu_kernel void @fadd_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: - %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid + %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b %r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } ; GCN-LABEL: {{^}}fadd_v2f16_imm_b: -; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] @@ -134,12 +139,12 @@ entry: ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 ; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -147,8 +152,15 @@ define amdgpu_kernel void @fadd_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: - %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid + %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a %r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0> store <2 x half> %r.val, <2 x half> addrspace(1)* %r ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll index c936d98673ba1..8fd1f52006fbb 100644 --- a/test/CodeGen/AMDGPU/fadd64.ll +++ b/test/CodeGen/AMDGPU/fadd64.ll @@ -5,8 +5,11 @@ ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr inbounds double, double addrspace(1)* %in1, i32 %tid + %gep2 = getelementptr inbounds double, double addrspace(1)* %in2, 
i32 %tid + %r0 = load double, double addrspace(1)* %gep1 + %r1 = load double, double addrspace(1)* %gep2 %r2 = fadd double %r0, %r1 store double %r2, double addrspace(1)* %out ret void @@ -42,3 +45,8 @@ define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x do store <2 x double> %r2, <2 x double> addrspace(1)* %out ret void } + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll new file mode 100644 index 0000000000000..5383bbe71ae36 --- /dev/null +++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -0,0 +1,487 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GCN-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-EXCEPT -check-prefix=VI -check-prefix=GCN-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GCN-FLUSH %s + +; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32: +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %v = load float, float addrspace(1)* %gep, align 4 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32: +; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fmul float %load, 15.0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32: +; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fsub float 15.0, %load + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32: +; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + 
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fadd float %load, 15.0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32: +; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.sqrt.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fceil_value_f32: +; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.ceil.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_floor_value_f32: +; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.floor.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fma_value_f32: +; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32: +; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0) + %canonicalized = tail call float 
@llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32: +; GCN: flat_load_dword [[LOAD:v[0-9]+]], +; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = call float @llvm.canonicalize.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32: +; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fpext float %load to double + %canonicalized = tail call double @llvm.canonicalize.f64(double %v) + %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id + store double %canonicalized, double addrspace(1)* %gep2, align 8 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16: +; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %load = load half, half addrspace(1)* %gep, align 2 + %v = fpext half %load to float + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id + store float %canonicalized, float addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64: +; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id + %load = load double, double addrspace(1)* %gep, align 8 + %v = fptrunc double %load to float + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id + store float %canonicalized, float addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32: +; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_short v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = 
fptrunc float %load to half + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id + store half %canonicalized, half addrspace(1)* %gep2, align 2 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32: +; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}} +; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}} +; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]] +; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}} +; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]] +; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %v = fptrunc <2 x float> %load to <2 x half> + %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v) + %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32: +; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}} +define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = fsub float -0.0, %load + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fneg_value_f32: +; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v0 = fadd float %load, 0.0 + %v = fsub float -0.0, %v0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32: +; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| +define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.fabs.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_fabs_value_f32: +; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, 
i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v0 = fadd float %load, 0.0 + %v = tail call float @llvm.fabs.f32(float %v0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_sin_value_f32: +; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.sin.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_cos_value_f32: +; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.cos.f32(float %load) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_sin_value_f16: +; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}} +; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]] +; GCN: flat_store_short v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %load = load half, half addrspace(1)* %gep, align 2 + %v = tail call half @llvm.sin.f16(half %load) + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + store half %canonicalized, half addrspace(1)* %gep, align 2 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_cos_value_f16: +; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}} +; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]] +; GCN: flat_store_short v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %load = load half, half addrspace(1)* %gep, align 2 + %v = tail call half @llvm.cos.f16(half %load) + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + store half %canonicalized, half addrspace(1)* %gep, align 2 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32: +; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000 +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32: +; GCN: v_mul_f32_e32 v{{[0-9]+}}, 
1.0, v{{[0-9]+}} +define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.minnum.f32(float %load, float 0.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_minnum_value_f32: +; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v0 = fadd float %load, 0.0 + %v = tail call float @llvm.minnum.f32(float %v0, float 0.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32: +; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}} +; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float)) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: +; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32: +; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} +; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v = tail call float @llvm.maxnum.f32(float %load, float 0.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32: +; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel 
void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %load = load float, float addrspace(1)* %gep, align 4 + %v0 = fadd float %load, 0.0 + %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0) + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + store float %canonicalized, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64: +; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0 +; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN-NOT: 1.0 +define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id + %load = load double, double addrspace(1)* %gep, align 8 + %v0 = fadd double %load, 0.0 + %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0) + %canonicalized = tail call double @llvm.canonicalize.f64(double %v) + store double %canonicalized, double addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee: +; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) { +entry: + %v = fmul float %arg, 15.0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + ret float %canonicalized +} + +; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee: +; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} +; GCN-NEXT: ; return +; GCN-NOT: 1.0 +define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) { +entry: + %v = fmul nnan float %arg, 15.0 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + ret float %canonicalized +} + +declare float @llvm.canonicalize.f32(float) #0 +declare double @llvm.canonicalize.f64(double) #0 +declare half @llvm.canonicalize.f16(half) #0 +declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare float @llvm.sqrt.f32(float) #0 +declare float @llvm.ceil.f32(float) #0 +declare float @llvm.floor.f32(float) #0 +declare float @llvm.fma.f32(float, float, float) #0 +declare float @llvm.fmuladd.f32(float, float, float) #0 +declare float @llvm.fabs.f32(float) #0 +declare float @llvm.sin.f32(float) #0 +declare float @llvm.cos.f32(float) #0 +declare half @llvm.sin.f16(half) #0 +declare half @llvm.cos.f16(half) #0 +declare float @llvm.minnum.f32(float, float) #0 +declare float @llvm.maxnum.f32(float, float) #0 +declare double @llvm.maxnum.f64(double, double) #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 404358f0ecb98..dd8e277c1c75f 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -5,6 +5,8 @@ declare half @llvm.fabs.f16(half) #0 declare half @llvm.canonicalize.f16(half) #0 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16: ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} @@ -213,7 +215,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, 
{{v[0-9]+$}}
; GFX9: buffer_store_dword [[REG]]
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
-  %val = load <2 x half>, <2 x half> addrspace(1)* %out
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
  ret void
@@ -233,7 +237,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
; GCN: buffer_store_dword
define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
-  %val = load <2 x half>, <2 x half> addrspace(1)* %out
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
  %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -251,7 +257,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GCN: buffer_store_dword
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
-  %val = load <2 x half>, <2 x half> addrspace(1)* %out
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
  %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
@@ -270,7 +278,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GFX9: buffer_store_dword [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
-  %val = load <2 x half>, <2 x half> addrspace(1)* %out
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep
  %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll
index 8c385f40b1c5f..feb4c7bd4a183 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll
index 7916226462f77..aef898b1a8ee8 100644
--- a/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}fcmp_f16_lt
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -351,23 +351,12 @@ entry:
  ret void
}

-; GCN-LABEL: {{^}}fcmp_v2f16_lt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_lt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_lt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_lt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_lt:
+; SI: v_cmp_lt_f32_e32 vcc,
+; SI: v_cmp_lt_f32_e32 vcc,
+
+; VI: v_cmp_lt_f16_e32 vcc,
+; VI: v_cmp_lt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -382,22 +371,11 @@ entry:
}

; GCN-LABEL: {{^}}fcmp_v2f16_eq
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_eq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_eq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_eq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_eq_f32_e32 vcc,
+; SI: v_cmp_eq_f32_e32 vcc,
+
+; VI: v_cmp_eq_f16_e32 vcc,
+; VI: v_cmp_eq_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_eq(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -411,23 +389,11 @@ entry:
  ret void
}

-; GCN-LABEL: {{^}}fcmp_v2f16_le
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_le_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_le_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_le_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_le_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_le:
+; SI: v_cmp_le_f32_e32 vcc
+; SI: v_cmp_le_f32_e32 vcc
+; VI: v_cmp_le_f16_e32 vcc
+; VI: v_cmp_le_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_le(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -441,23 +407,12 @@ entry:
  ret void
}

-; GCN-LABEL: {{^}}fcmp_v2f16_gt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_gt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_gt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_gt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_gt:
+; SI: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e32 vcc,
+
+; VI: v_cmp_gt_f16_e32 vcc,
+; VI: v_cmp_gt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_gt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -471,23 +426,12 @@ entry:
  ret void
}

-; GCN-LABEL: {{^}}fcmp_v2f16_lg
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_lg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_lg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_lg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_lg:
+; SI: v_cmp_lg_f32_e32 vcc,
+; SI: v_cmp_lg_f32_e32 vcc,
+
+; VI: v_cmp_lg_f16_e32 vcc,
+; VI: v_cmp_lg_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_lg(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -501,23 +445,12 @@ entry:
  ret void
}

-; GCN-LABEL: {{^}}fcmp_v2f16_ge
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_ge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_ge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_ge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_ge:
+; SI: v_cmp_ge_f32_e32 vcc,
+; SI: v_cmp_ge_f32_e32 vcc,
+
+; VI: v_cmp_ge_f16_e32 vcc,
+; VI: v_cmp_ge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ge(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -531,23 +464,12 @@ entry:
  ret void
}

-; GCN-LABEL: {{^}}fcmp_v2f16_o
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_o_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_o_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_o_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_o_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_o:
+; SI: v_cmp_o_f32_e32 vcc,
+; SI: v_cmp_o_f32_e32 vcc,
+
+; VI: v_cmp_o_f16_e32 vcc,
+; VI: v_cmp_o_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_o(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -561,23 +483,12 @@ entry:
  ret void
}

-; GCN-LABEL: {{^}}fcmp_v2f16_u
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_u_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_u_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_u_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_u_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; GCN-LABEL: {{^}}fcmp_v2f16_u:
+; SI: v_cmp_u_f32_e32 vcc,
+; SI: v_cmp_u_f32_e32 vcc,
+
+; VI: v_cmp_u_f16_e32 vcc,
+; VI: v_cmp_u_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_u(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -592,22 +503,11 @@ entry:
}

; GCN-LABEL: {{^}}fcmp_v2f16_nge
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_nge_f32_e32 vcc,
+; SI: v_cmp_nge_f32_e32 vcc,
+
+; VI: v_cmp_nge_f16_e32 vcc,
+; VI: v_cmp_nge_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_nge(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -622,22 +522,11 @@ entry:
}

; GCN-LABEL: {{^}}fcmp_v2f16_nlg
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nlg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nlg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nlg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_nlg_f32_e32 vcc
+; SI: v_cmp_nlg_f32_e32 vcc
+
+; VI: v_cmp_nlg_f16_e32 vcc
+; VI: v_cmp_nlg_f16_e32 vcc
define amdgpu_kernel void @fcmp_v2f16_nlg(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -652,22 +541,11 @@ entry:
}

; GCN-LABEL: {{^}}fcmp_v2f16_ngt
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_ngt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_ngt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_ngt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_ngt_f32_e32 vcc,
+; SI: v_cmp_ngt_f32_e32 vcc,
+
+; VI: v_cmp_ngt_f16_e32 vcc,
+; VI: v_cmp_ngt_f16_e32 vcc,
define amdgpu_kernel void @fcmp_v2f16_ngt(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -682,22 +560,11 @@ entry:
}

; GCN-LABEL: {{^}}fcmp_v2f16_nle
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nle_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nle_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nle_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_nle(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -712,22 +579,11 @@ entry:
}

; GCN-LABEL: {{^}}fcmp_v2f16_neq
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_neq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_neq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_neq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
-; GCN: s_endpgm
+; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fcmp_v2f16_neq(
    <2 x i32> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
@@ -744,17 +600,19 @@ entry:
}

; GCN-LABEL: {{^}}fcmp_v2f16_nlt
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
-; SI: v_cmp_nlt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
-; VI: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
-; VI: v_cmp_nlt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
+
+; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
+; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
+
+; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]]
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll
index b9e1921d4c455..95f7e0be7d9c9 100644
--- a/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/test/CodeGen/AMDGPU/fcmp64.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s

; CHECK-LABEL: {{^}}flt_f64:
-; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nge_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                        double addrspace(1)* %in2) {
  %r0 = load double, double addrspace(1)* %in1
@@ -14,7 +14,7 @@ define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)*
}

; CHECK-LABEL: {{^}}fle_f64:
-; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_ngt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                        double addrspace(1)* %in2) {
  %r0 = load double, double addrspace(1)* %in1
@@ -26,7 +26,7 @@ define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)*
}

; CHECK-LABEL: {{^}}fgt_f64:
-; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nle_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                        double addrspace(1)* %in2) {
  %r0 = load double, double addrspace(1)* %in1
@@ -38,7 +38,7 @@ define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)*
}

; CHECK-LABEL: {{^}}fge_f64:
-; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                        double addrspace(1)* %in2) {
  %r0 = load double, double addrspace(1)* %in1
@@ -50,7 +50,7 @@ define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)*
}

; CHECK-LABEL: {{^}}fne_f64:
-; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_neq_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                        double addrspace(1)* %in2) {
  %r0 = load double, double addrspace(1)* %in1
@@ -62,7 +62,7 @@ define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1
}

; CHECK-LABEL: {{^}}feq_f64:
-; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+; CHECK: v_cmp_nlg_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                        double addrspace(1)* %in2) {
  %r0 = load double, double addrspace(1)* %in1
diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll
index 1255977962454..ca313d80894a6 100644
--- a/test/CodeGen/AMDGPU/fconst64.ll
+++ b/test/CodeGen/AMDGPU/fconst64.ll
@@ -6,8 +6,15 @@
; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0

define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
-  %r1 = load double, double addrspace(1)* %in
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds double, double addrspace(1)* %in, i32 %tid
+  %r1 = load double, double addrspace(1)* %gep
  %r2 = fadd double %r1, 5.000000e+00
  store double %r2, double addrspace(1)* %out
  ret void
}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fcopysign.f16.ll b/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 4e2bf765cd95f..8e984246cc94d 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s

declare half @llvm.copysign.f16(half, half)
declare float @llvm.copysign.f32(float, float)
@@ -9,16 +9,18 @@ declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>)
declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)

+declare i32 @llvm.amdgcn.workitem.id.x()
+
; GCN-LABEL: {{^}}test_copysign_f16:
-; SI: buffer_load_ushort v[[SIGN:[0-9]+]]
-; SI: buffer_load_ushort v[[MAG:[0-9]+]]
+; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
+; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
-; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]]
-; GFX89: buffer_load_ushort v[[MAG:[0-9]+]]
+; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
+; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
; GCN: buffer_store_short v[[OUT]]
@@ -36,8 +38,8 @@ entry:
}

; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
@@ -48,17 +50,20 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
    half addrspace(1)* %arg_mag,
    float addrspace(1)* %arg_sign) {
entry:
-  %mag = load half, half addrspace(1)* %arg_mag
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+  %mag = load half, half addrspace(1)* %arg_mag_gep
  %mag.ext = fpext half %mag to float
-  %sign = load float, float addrspace(1)* %arg_sign
+  %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
+  %sign = load float, float addrspace(1)* %arg_sign_gep
  %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
  store float %out, float addrspace(1)* %arg_out
  ret void
}

; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]]
@@ -70,17 +75,20 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
    half addrspace(1)* %arg_mag,
    double addrspace(1)* %arg_sign) {
entry:
-  %mag = load half, half addrspace(1)* %arg_mag
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+  %mag = load half, half addrspace(1)* %arg_mag_gep
  %mag.ext = fpext half %mag to double
-  %sign = load double, double addrspace(1)* %arg_sign
+  %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
+  %sign = load double, double addrspace(1)* %arg_sign_gep
  %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
  store double %out, double addrspace(1)* %arg_out
  ret void
}

; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
-; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]]
@@ -93,8 +101,11 @@ define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
    float addrspace(1)* %arg_mag,
    half addrspace(1)* %arg_sign) {
entry:
-  %mag = load float, float addrspace(1)* %arg_mag
-  %sign = load half, half addrspace(1)* %arg_sign
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
+  %mag = load float, float addrspace(1)* %arg_mag_gep
+  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+  %sign = load half, half addrspace(1)* %arg_sign_gep
  %sign.ext = fpext half %sign to float
  %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
  store float %out, float addrspace(1)* %arg_out
@@ -102,8 +113,8 @@ entry:
}

; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]]
@@ -116,8 +127,11 @@ define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
    double addrspace(1)* %arg_mag,
    half addrspace(1)* %arg_sign) {
entry:
-  %mag = load double, double addrspace(1)* %arg_mag
-  %sign = load half, half addrspace(1)* %arg_sign
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid
+  %mag = load double, double addrspace(1)* %arg_mag_gep
+  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+  %sign = load half, half addrspace(1)* %arg_sign_gep
  %sign.ext = fpext half %sign to double
  %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
  store double %out, double addrspace(1)* %arg_out
@@ -125,8 +139,8 @@ entry:
}

; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]]
@@ -141,8 +155,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
    half addrspace(1)* %arg_mag,
    float addrspace(1)* %arg_sign) {
entry:
-  %mag = load half, half addrspace(1)* %arg_mag
-  %sign = load float, float addrspace(1)* %arg_sign
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
+  %mag = load half, half addrspace(1)* %arg_mag_gep
+  %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
+  %sign = load float, float addrspace(1)* %arg_sign_gep
  %sign.trunc = fptrunc float %sign to half
  %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
  store half %out, half addrspace(1)* %arg_out
@@ -150,8 +167,8 @@ entry:
}

; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
-; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
@@ -166,8 +183,11 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
    half addrspace(1)* %arg_mag,
    double addrspace(1)* %arg_sign) {
entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
  %mag = load half, half addrspace(1)* %arg_mag
-  %sign = load double, double addrspace(1)* %arg_sign
+  %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
+  %sign = load double, double addrspace(1)* %arg_sign_gep
  %sign.trunc = fptrunc double %sign to half
  %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
  store half %out, half addrspace(1)* %arg_out
@@ -175,8 +195,8 @@ entry:
}

; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
-; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
-; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
+; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
@@ -193,9 +213,12 @@ define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
    float addrspace(1)* %arg_mag,
    half addrspace(1)* %arg_sign) {
entry:
-  %mag = load float, float addrspace(1)* %arg_mag
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
+  %mag = load float, float addrspace(1)* %arg_mag_gep
  %mag.trunc = fptrunc float %mag to half
-  %sign = load half, half addrspace(1)* %arg_sign
+  %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
+  %sign = load half, half addrspace(1)* %arg_sign_gep
  %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
  store half %out, half addrspace(1)* %arg_out
  ret void
diff --git a/test/CodeGen/AMDGPU/fdiv.f16.ll b/test/CodeGen/AMDGPU/fdiv.f16.ll
index 7f84e973c9582..333143393cb46 100644
--- a/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -27,7 +27,7 @@
; VI-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]

; VI-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
-; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[RCP_RHS]], [[CVT_LHS]]
+; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -165,7 +165,7 @@ entry:
; VI: flat_load_ushort [[RHS:v[0-9]+]]
; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
-; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
+; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
@@ -187,7 +187,7 @@ entry:
; VI: flat_load_ushort [[RHS:v[0-9]+]]
; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
-; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
+; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index 738a5adba14fb..bc489454341a0 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -20,7 +20,7 @@
; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
@@ -45,7 +45,7 @@ entry:
; GCN-NOT: s_setreg
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
+; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
@@ -85,20 +85,11 @@ entry:
}

; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
-; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
-; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
-; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
-
-; GCN-NOT: s_setreg
-; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
-; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
-; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
-; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
-; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
-; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
; GCN-NOT: s_setreg
-; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
-; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
+; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv fast float %a, %b
@@ -121,6 +112,21 @@ entry:
  ret void
}

+; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
+entry:
+  %fdiv = fdiv fast float %a, %b, !fpmath !0
+  store float %fdiv, float addrspace(1)* %out
+  ret void
+}
+
; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
@@ -154,8 +160,9 @@ entry:
}

; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
-; GCN: v_cmp_gt_f32
-; GCN: v_cmp_gt_f32
+; GCN: v_rcp_f32
+; GCN: v_rcp_f32
+; GCN-NOT: v_cmp_gt_f32
define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
index 4113ba8dc1f07..7526d08bdbe52 100644
--- a/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s

; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
@@ -387,7 +387,7 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
@@ -403,7 +403,7 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
@@ -419,7 +419,7 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
@@ -435,7 +435,7 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
@@ -451,7 +451,7 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
@@ -467,7 +467,7 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
@@ -483,7 +483,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
@@ -499,7 +499,7 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
@@ -515,7 +515,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
@@ -531,7 +531,7 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
@@ -547,7 +547,7 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
@@ -563,7 +563,7 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
@@ -583,8 +583,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
-; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VT1]], [[VY:v[0-9]]]
-; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VT]], [[VX:v[0-9]]]
+; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
+; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll
index 4d3f3712621ef..907121f1cd46b 100644
--- a/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/test/CodeGen/AMDGPU/fma.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

declare double @llvm.fma.f64(double, double, double) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll
index 659cecb59ebf7..6be4c450a51ed 100644
--- a/test/CodeGen/AMDGPU/fma.ll
+++ b/test/CodeGen/AMDGPU/fma.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

declare float @llvm.fma.f32(float, float, float) nounwind readnone
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
index 7643c3ea533ce..44c80b63bf7c3 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]

; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -31,7 +31,7 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -51,7 +51,7 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -71,7 +71,7 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -91,7 +91,7 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll
index 27d9261b1fab8..4cfc9fc80fb07 100644
--- a/test/CodeGen/AMDGPU/fmed3.ll
+++ b/test/CodeGen/AMDGPU/fmed3.ll
@@ -872,8 +872,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(fl
; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]]
-; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]]
+; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index 52336f95a9096..0494295fc15f0 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out,
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x() #1
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -64,7 +64,7 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x() #1
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -83,7 +83,7 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x() #1
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -102,7 +102,7 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x() #1
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -121,7 +121,7 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, fl
; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.r600.read.tidig.x() #1
  %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll
index cd86409e20384..5f120f63d7fe3 100644
--- a/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -1,14 +1,14 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; GCN-LABEL: {{^}}fmul_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_f16(
@@ -70,16 +70,16 @@ entry:
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

-; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -108,7 +108,7 @@ entry:
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16_imm_a(
@@ -134,7 +134,7 @@ entry:
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16_imm_b(
diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll
index f14233f267b2b..d37d432842f37 100644
--- a/test/CodeGen/AMDGPU/fmul64.ll
+++ b/test/CodeGen/AMDGPU/fmul64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s

; FUNC-LABEL: {{^}}fmul_f64:
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fmuladd.f16.ll b/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 9b713419e7471..980d68ceded87 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -79,7 +79,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half add
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
@@ -108,7 +108,7 @@ define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
@@ -227,8 +227,8 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -257,8 +257,8 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -287,7 +287,7 @@ define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -319,7 +319,7 @@ define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -347,13 +347,13 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocap
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

-; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGB]], [[REGA]]
+; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -385,7 +385,7 @@ define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
@@ -416,7 +416,7 @@ define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
@@ -444,7 +444,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half add
; VI-DENORM-CONTRACT: v_fma_f16 [[R2]], [[R1]], 2.0, -[[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll
index e422550266924..4b1e41ff91e17 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -1,12 +1,12 @@
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s

-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
-; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s

; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
@@ -67,7 +67,7 @@ define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspa
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -96,7 +96,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float a
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -125,10 +125,10 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float a
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -160,10 +160,10 @@ define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -192,7 +192,7 @@ define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -221,7 +221,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, flo
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -252,7 +252,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out,
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -282,7 +282,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, flo
; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -310,11 +310,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, flo
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -345,11 +345,11 @@ define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %ou
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -379,10 +379,10 @@ define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; SI: buffer_store_dword [[RESULT]]
@@ -414,10 +414,10 @@ define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocaptur
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; SI: buffer_store_dword [[RESULT]]
@@ -446,17 +446,17 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias noca
; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
-; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]]
+; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
; SI-FLUSH: buffer_store_dword [[REGC]]
; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -489,10 +489,10 @@ define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -525,10 +525,10 @@ define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocaptur
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -556,10 +556,10 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float a
; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; SI: buffer_store_dword [[RESULT]]
; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll
index 86e91e04b0fc3..8d91a56ee4211 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
-; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
-; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
; GCN-LABEL: {{^}}fmuladd_f64:
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
index 624610096cbc5..b50a26c023ca3 100644
--- a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
-
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index 66bf9d0ffb00e..002bc47fb96ae 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -9,7 +9,7 @@
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
@@ -31,7 +31,7 @@ define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrsp
; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NEXT: buffer_store_dword [[ADD]]
@@ -54,7 +54,7 @@ define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
@@ -82,10 +82,10 @@ define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_subrev_f32_e32
+; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000,
-; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -106,10 +106,10 @@ define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
-; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -133,7 +133,7 @@ define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float
; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
-; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -157,11 +157,11 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl
; GCN-SAFE: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}}
; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[A]], [[SIGNBIT]]
-; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[ADD]], [[SIGNBIT]]
; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
@@ -185,10 +185,10 @@ define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
-; GCN-SAFE-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
-; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
@@ -235,7 +235,7 @@ define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrsp
; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[ADD]]
@@ -280,7 +280,7 @@ define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out
; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -300,7 +300,7 @@ define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -342,7 +342,7 @@ define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, fl
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
@@ -364,7 +364,7 @@ define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %
; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
; GCN: buffer_store_dword [[MUL]]
@@ -974,7 +974,7 @@ define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]
; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
@@ -1000,7 +1000,7 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
@@ -1449,7 +1449,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[ADD]]
@@ -1494,7 +1494,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addr
; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1514,7 +1514,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out
; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1556,7 +1556,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
@@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspac
; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
+; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
; GCN: buffer_store_dword [[MUL]]
@@ -1664,7 +1664,7 @@ define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addr
; GCN-LABEL: {{^}}v_fneg_round_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e32
-; GCN: v_subrev_f32_e32
+; GCN: v_sub_f32_e32
; GCN: v_cndmask_b32
; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
@@ -1782,11 +1782,11 @@ define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: s_cbranch_scc1
; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
-; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]]
+; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
; GCN: buffer_store_dword [[MUL1]]
; GCN: buffer_store_dword [[MUL0]]
@@ -1851,7 +1851,7 @@ define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
; GCN: ; use [[NEG]]
; GCN: buffer_store_dword [[MUL]]
@@ -1984,8 +1984,8 @@ define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)*
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
-; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]]
-; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]]
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]
; GCN: buffer_store_dword [[MUL1]]
; GCN-NEXT: buffer_store_dword [[MUL2]]
@@ -2084,7 +2084,7 @@ define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
-; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
+; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
; GCN: buffer_store_dword [[FMA0]]
; GCN: buffer_store_dword [[MUL1]]
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index f4afaca2b7a7f..56aea641d16e6 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -5,7 +5,7 @@
; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
; CI: v_cvt_f32_f16_e32
; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
-; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}}
+; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]]
; GFX89-NOT: _and
; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
@@ -20,7 +20,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x,
; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
; CI-DAG: v_cvt_f32_f16_e32
; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
-; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}}
+; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]]
; CI: v_cvt_f16_f32_e32
; GFX89-NOT: _and
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll
index 0a7346f410c94..3f20ca73e9228 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
; SI-NOT: and
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll
index 2d94726cbe204..49d6742527468 100644
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
; FIXME: Should be able to do scalar op
; GCN-LABEL: {{^}}s_fneg_f16:
@@ -46,7 +46,7 @@ define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]
-; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]]
+; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[NEG_CVT0]], [[CVT_VAL]]
; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
diff --git a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
index 986c6b296c962..3155b7a8664fb 100644
--- a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
+++ b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
@@ -1,26 +1,5 @@
# RUN: llc -march=amdgcn -run-pass peephole-opt -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
-
---- |
-  define amdgpu_kernel void @no_fold_imm_madak_mac_clamp_f32() #0 {
-    ret void
-  }
-
-  define amdgpu_kernel void @no_fold_imm_madak_mac_omod_f32() #0 {
-    ret void
-  }
-
-  define amdgpu_kernel void @no_fold_imm_madak_mad_clamp_f32() #0 {
-    ret void
-  }
-
-  define amdgpu_kernel void @no_fold_imm_madak_mad_omod_f32() #0 {
-    ret void
-  }
-
-  attributes #0 = { nounwind }
-
...
----
# GCN-LABEL: name: no_fold_imm_madak_mac_clamp_f32
# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
# GCN-NEXT: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
@@ -62,14 +41,14 @@ liveins:
  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
  - { reg: '%vgpr0', virtual-reg: '%3' }
body: |
-  bb.0 (%ir-block.0):
+  bb.0:
    liveins: %sgpr0_sgpr1, %vgpr0
    %3 = COPY %vgpr0
    %0 = COPY %sgpr0_sgpr1
-    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
    %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
    %28 = REG_SEQUENCE %3, 1, %27, 2
    %11 = S_MOV_B32 61440
@@ -133,14 +112,14 @@ liveins:
  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
  - { reg: '%vgpr0', virtual-reg: '%3' }
body: |
-  bb.0 (%ir-block.0):
+  bb.0:
    liveins: %sgpr0_sgpr1, %vgpr0
    %3 = COPY %vgpr0
    %0 = COPY %sgpr0_sgpr1
-    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
    %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
    %28 = REG_SEQUENCE %3, 1, %27, 2
    %11 = S_MOV_B32 61440
@@ -204,14 +183,14 @@ liveins:
  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
  - { reg: '%vgpr0', virtual-reg: '%3' }
body: |
-  bb.0 (%ir-block.0):
+  bb.0:
    liveins: %sgpr0_sgpr1, %vgpr0
    %3 = COPY %vgpr0
    %0 = COPY %sgpr0_sgpr1
-    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
    %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
    %28 = REG_SEQUENCE %3, 1, %27, 2
    %11 = S_MOV_B32 61440
@@ -275,14 +254,14 @@ liveins:
  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
  - { reg: '%vgpr0', virtual-reg: '%3' }
body: |
-  bb.0 (%ir-block.0):
+  bb.0:
    liveins: %sgpr0_sgpr1, %vgpr0
    %3 = COPY %vgpr0
    %0 = COPY %sgpr0_sgpr1
-    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
-    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
+    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0
    %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
    %28 = REG_SEQUENCE %3, 1, %27, 2
    %11 = S_MOV_B32 61440
diff --git a/test/CodeGen/AMDGPU/fold-operands-order.mir b/test/CodeGen/AMDGPU/fold-operands-order.mir
index afde89d6b64bc..51bb357fcf6ee 100644
--- a/test/CodeGen/AMDGPU/fold-operands-order.mir
+++ b/test/CodeGen/AMDGPU/fold-operands-order.mir
@@ -1,10 +1,4 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -check-prefix=GCN %s
-
---- |
-  define amdgpu_kernel void @mov_in_use_list_2x() {
-    unreachable
-  }
-
...
---
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 2c6b1cb18f7e6..579a1454dd9ae 100644
--- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll
index 15cc73b9ee53e..ec19fd199b4ec 100644
--- a/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
; GCN-LABEL: {{^}}fpext_f16_to_f32
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -154,7 +154,7 @@ entry:
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]]
; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
-; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]]
+; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA_NEG]], [[CVTA]]
; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
diff --git a/test/CodeGen/AMDGPU/fptosi.f16.ll b/test/CodeGen/AMDGPU/fptosi.f16.ll
index f310618d8bdb6..f593030764a99 100644
--- a/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fptosi_f16_to_i16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -60,7 +60,7 @@ entry:
; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_LO]], v[[R_I16_HI]]
; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
diff --git a/test/CodeGen/AMDGPU/fptoui.f16.ll b/test/CodeGen/AMDGPU/fptoui.f16.ll
index 7641c08e33c36..cebe3304d542b 100644
--- a/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}fptoui_f16_to_i16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -60,7 +60,7 @@ entry:
; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_HI]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll
index bc72f4424c98f..64df625d4bb5a 100644
--- a/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
@@ -38,10 +38,10 @@ entry:
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
@@ -68,7 +68,7 @@ entry:
; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll
index 9a56cbe983cdd..1314dfe3c7cab 100644
--- a/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
declare double @llvm.fabs.f64(double) #0
declare double @llvm.floor.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll
index 207fe280c9a69..2217f67da7d3b 100644
--- a/test/CodeGen/AMDGPU/fract.ll
+++ b/test/CodeGen/AMDGPU/fract.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.floor.f32(float) #0
; GCN-LABEL: {{^}}fract_f32:
; GCN-SAFE: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]]
-; GCN-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]]
+; GCN-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[INPUT]], [[FLR]]
; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll
index 9778069d0477b..3b8f58cc18a7b 100644
--- a/test/CodeGen/AMDGPU/frem.ll
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}frem_f32:
; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
@@ -29,7 +29,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)
; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
-; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
+; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[X]], [[INVY]]
; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
; GCN: buffer_store_dword [[RESULT]]
diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 453d8fb37f2f4..186757e4c5d84 100644
--- a/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll
index a0fd3411ca05c..6bd9a0db14f66 100644
--- a/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll
index fa00c06546dbd..15a4ce2d88f7d 100644
--- a/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}fsub_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
+; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fsub_f16(
@@ -70,16 +70,16 @@ entry:
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
+; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
@@ -109,12 +109,12 @@ entry:
; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG:
v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0] @@ -143,12 +143,12 @@ entry: ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00 ; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000 ; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}} diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll index e7a92d95d4859..48647a2cdb898 100644 --- a/test/CodeGen/AMDGPU/fsub.ll +++ b/test/CodeGen/AMDGPU/fsub.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_fsub_f32: -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 @@ -41,10 +41,10 @@ define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load 
<4 x float>, <4 x float> addrspace(1)* %in, align 16 @@ -67,7 +67,7 @@ define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x flo } ; FUNC-LABEL: {{^}}v_fneg_fsub_f32: -; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 @@ -80,7 +80,7 @@ define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrs } ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32: -; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI-NOT: xor define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 @@ -93,7 +93,7 @@ define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float a } ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32: -; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI-NOT: xor define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 @@ -109,7 +109,7 @@ define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %ou ; make sure it is disabled and the fneg is not folded if it is not ; "true". ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32: -; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll index dc332414a1527..73f1a69eeb9d6 100644 --- a/test/CodeGen/AMDGPU/fsub64.ll +++ b/test/CodeGen/AMDGPU/fsub64.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare double @llvm.fabs.f64(double) #0 diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll index 1f72ec65588ea..bb2a6ba8e3483 100644 --- a/test/CodeGen/AMDGPU/ftrunc.f64.ll +++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc 
-amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s declare double @llvm.trunc.f64(double) nounwind readnone declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll index 19e592f50beaf..4e50f995d27e7 100644 --- a/test/CodeGen/AMDGPU/global-extload-i16.ll +++ b/test/CodeGen/AMDGPU/global-extload-i16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FIXME: cypress is broken because the bigger testcases spill and it's not implemented diff --git a/test/CodeGen/AMDGPU/global-smrd-unknown.ll b/test/CodeGen/AMDGPU/global-smrd-unknown.ll new file mode 100644 index 0000000000000..8a576e6480a11 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-smrd-unknown.ll @@ -0,0 +1,20 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -memdep-block-scan-limit=1 -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; GCN-LABEL: {{^}}unknown_memdep_analysis: +; GCN: flat_load_dword +; GCN: flat_load_dword +; GCN: flat_store_dword +define amdgpu_kernel void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg) #0 { +bb: + %tmp53 = load float, float addrspace(1)* undef, align 4 + %tmp54 = getelementptr inbounds float, float addrspace(1)* %arg, i32 31 + %tmp55 = load float, float addrspace(1)* %tmp54, align 4 + %tmp56 = tail call float @llvm.fmuladd.f32(float undef, float %tmp53, float %tmp55) + store float %tmp56, float addrspace(1)* undef, align 4 + ret void +} + +declare float @llvm.fmuladd.f32(float, float, float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll index 41ae5a4a0b00b..43745d4b3da3d 100644 --- a/test/CodeGen/AMDGPU/half.ll +++ b/test/CodeGen/AMDGPU/half.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; half args should be promoted to float for SI and lower. 
@@ -17,7 +17,7 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 ; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] -; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]] +; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { @@ -471,10 +471,10 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, ; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] ; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] -; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]] +; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]] ; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]] +; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]] ; GCN-DAG: buffer_store_dword [[PACKED]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll index c2668a077b098..8cda01a10f765 100644 --- a/test/CodeGen/AMDGPU/imm.ll +++ b/test/CodeGen/AMDGPU/imm.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; Use a 64-bit value with lo bits that can be represented as an inline constant ; GCN-LABEL: {{^}}i64_imm_inline_lo: diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll index cd3502baee7be..fe86a58729681 100644 --- a/test/CodeGen/AMDGPU/immv216.ll +++ b/test/CodeGen/AMDGPU/immv216.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; FIXME: Merge into imm.ll ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: @@ -305,7 +305,7 @@ define amdgpu_kernel void 
@commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}} ; VI-DAG: buffer_load_dword ; VI-NOT: and -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: buffer_store_dword diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll index 0d20c32a4770c..62200b988bea3 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s ; Tests for indirect addressing on SI, which is implemented using dynamic ; indexing of vectors. 
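As an illustrative aside (not part of the patch): the indirect-addressing tests above exercise the dynamic vector indexing mentioned in that comment. A minimal hypothetical kernel showing the IR shape involved — an extractelement with a non-constant index, which the SI backend must lower via movrel or VGPR index mode — might look like this; the function and value names are invented for illustration:

; Hypothetical sketch, not taken from the patch.
define amdgpu_kernel void @extract_dyn(float addrspace(1)* %out, <4 x float> %vec, i32 %idx) {
entry:
  ; %idx is not a compile-time constant, so the element cannot be selected
  ; statically; the backend emits dynamically indexed (indirect) access.
  %elt = extractelement <4 x float> %vec, i32 %idx
  store float %elt, float addrspace(1)* %out, align 4
  ret void
}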
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll index c0f5218efc16b..75826d530cb04 100644 --- a/test/CodeGen/AMDGPU/inline-asm.ll +++ b/test/CodeGen/AMDGPU/inline-asm.ll @@ -222,9 +222,9 @@ entry: ; FIXME: Should be scheduled to shrink vcc ; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2: ; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0 -; CHECK: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1 +; CHECK: v_cndmask_b32_e64 v1, 0, -1, vcc define amdgpu_kernel void @i1_input_phys_vgpr_x2() { entry: %val0 = load volatile i1, i1 addrspace(1)* undef diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll index 5cd965d2fa9c3..eea26192ed322 100644 --- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GatherAllAliases gives up on trying to analyze cases where the ; pointer may have been loaded from an aliased store, so make sure diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll index f08d4b6c79156..06dc2cc8b90e1 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.fabs.f16(half %a) declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll index 1fcdac537fba6..f71b9752e9a10 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 declare i1 @llvm.amdgcn.class.f64(double, i32) #1 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index 2cc63ae74bf10..1b3e09a81e5a0 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s ; FIXME: Enable for VI. 
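For reference, the recurring RUN-line change throughout this patch has the mechanical shape sketched below in a hypothetical minimal test (names invented for illustration). Prepending -amdgpu-scalarize-global-loads=false turns off the global-load scalarization pass for these tests, presumably so their existing buffer_load checks keep matching:

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}example_load:
; GCN: buffer_load_dword
define amdgpu_kernel void @example_load(float addrspace(1)* %out, float addrspace(1)* %in) {
  ; a plain per-thread load from global memory; with scalarization disabled
  ; this remains a vector buffer_load rather than becoming a scalar (SMRD) load
  %v = load float, float addrspace(1)* %in, align 4
  store float %v, float addrspace(1)* %out, align 4
  ret void
}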
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll index fe211d356070c..7068f45590551 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll index 593c95856811e..871b8c4f99b99 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg: ; GCN: v_bfe_i32 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll index 495e36b09f8fa..39370e41e8aa9 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.sffbh.i32(i32) #1 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll index e0cec2134e70c..8468aa3a7b3ef 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 92e3a1099da0a..68fd08f778c43 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck 
-check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg: ; GCN: v_bfe_u32 diff --git a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 0604a49372a2b..071f2a6de4cd0 100644 --- a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.ceil.f16(half %a) declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.cos.f16.ll index d836ea36ef632..8931de63e74ba 100644 --- a/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.cos.f16(half %a) declare <2 x half> @llvm.cos.v2f16(<2 x half> %a) @@ -29,8 +29,8 @@ entry: ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]] -; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]] +; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]] ; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -48,8 +48,8 @@ entry: ; 
GCN-NOT: and ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @cos_v2f16( diff --git a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll index 5757142b9e954..4e96a76197160 100644 --- a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.exp2.f16(half %a) declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 6a18141d8035e..74d1e694ffbe2 100644 --- a/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.floor.f16(half %a) declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword 
v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll index 3f4fba7d8ead0..a379b18ffb8b6 100644 --- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.fma.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) @@ -128,7 +128,7 @@ define amdgpu_kernel void @fma_f16_imm_c( ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16( @@ -167,7 +167,7 @@ define amdgpu_kernel void @fma_v2f16( ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_a( @@ -210,7 +210,7 @@ define amdgpu_kernel void @fma_v2f16_imm_a( ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_b( @@ -253,7 +253,7 @@ define amdgpu_kernel void @fma_v2f16_imm_b( ; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_v2f16_imm_c( diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 806723e5136ca..2d4fe08d8bde8 100644 --- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s -; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck 
-check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) @@ -13,11 +13,11 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]] +; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] ; SI: buffer_store_short v[[R_F16]] -; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]] +; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] ; VI-FLUSH: buffer_store_short v[[C_F16]] ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] @@ -110,19 +110,19 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]] -; SI: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] +; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] ; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]] +; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] ; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]] ; VI-FLUSH-NOT: v_and_b32 -; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]] +; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]] ; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] @@ -131,7 +131,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]] ; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]] ; VI-DENORM-NOT: v_and_b32 -; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[RES0]] +; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: 
s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/test/CodeGen/AMDGPU/llvm.log2.f16.ll index 773eb55283e44..277195c532086 100644 --- a/test/CodeGen/AMDGPU/llvm.log2.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.log2.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.log2.f16(half %a) declare <2 x half> @llvm.log2.v2f16(<2 x half> %a) @@ -33,12 +33,12 @@ entry: ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]] ; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 8f4b314ffabb2..c72716439a761 100644 --- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -9,9 +9,9 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_f16( @@ -73,18 +73,18 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; 
SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] -; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -115,7 +115,7 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_a( @@ -143,7 +143,7 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] +; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define amdgpu_kernel void @maxnum_v2f16_imm_b( diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 1a86286f7136c..0e93acc27dc5b 100644 --- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -9,9 +9,9 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] +; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @minnum_f16( @@ -72,18 +72,18 @@ entry: ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]] -; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] +; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] +; SI-DAG: v_min_f32_e32 
v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@@ -116,7 +116,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_v2f16_imm_a(
@@ -144,7 +144,7 @@ entry:
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_v2f16_imm_b(
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 30cb969a76e5a..92282083984bc 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
declare half @llvm.rint.f16(half %a)
declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -34,12 +34,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index ffe87977870ba..7e29147571f2f 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -12,7 +12,7 @@
; GCN: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
; GCN: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5
; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
-; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]]
; GCN: buffer_store_dword [[RESULT]]
; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
@@ -70,7 +70,7 @@ define amdgpu_kernel void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x floa
; GFX89: v_sub_f16_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
; GFX89: v_cmp_ge_f16_e64 vcc, |[[SUB]]|, 0.5
; GFX89: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[COPYSIGN]]
-; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TRUNC]], [[SEL]]
; GFX89: buffer_store_short [[RESULT]]
define amdgpu_kernel void @round_f16(half addrspace(1)* %out, i32 %x.arg) #0 {
%x.arg.trunc = trunc i32 %x.arg to i16
diff --git a/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index eb1f32c981f88..08b9d9d873b49 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.sin.f16(half %a)
declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
@@ -29,9 +29,9 @@ entry:
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
@@ -47,10 +47,10 @@ entry:
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 46ee6526aca2f..0e1358ecca226 100644
--- a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.sqrt.f16(half %a)
declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index dc7182aa0d89a..37ee4e92c6379 100644
--- a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
declare half @llvm.trunc.f16(half %a)
declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
@@ -33,12 +33,12 @@ entry:
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll
index bd6fea587b42f..77557a584093f 100644
--- a/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_f32:
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll
index 5b772e1fe5ee3..84214b7dbc106 100644
--- a/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_f64:
; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
index e3415b9c47dec..cb2495d5fdcf7 100644
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll
index 5df32c1e3120a..6360d39666c77 100644
--- a/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_i32:
diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll
index de16b6c8997ef..c71db0b7357cd 100644
--- a/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_i64:
; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll
index fc0cbf916b529..3fe6bd26be14f 100644
--- a/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}global_load_i8:
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll
index d6162c388b5b1..f9ba6241fe067 100644
--- a/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}load_i24:
; SI: {{flat|buffer}}_load_ubyte
diff --git a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 74564f387edeb..e1a2af6c7ef90 100644
--- a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,4 +1,5 @@
; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -amdgpu-lower-intrinsics -use-wide-memcpy-loop-lowering=true %s | FileCheck -check-prefix=WOPT %s
declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
@@ -21,6 +22,17 @@ define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)*
; OPT-NEXT: load i8
; OPT: getelementptr
; OPT-NEXT: store i8
+
+; WOPT-LABEL: @min_size_large_static_memcpy_caller0(
+; WOPT-NOT: call
+; WOPT: br label %load-store-loop
+; WOPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
+; WOPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
+; WOPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
+; WOPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
+; WOPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
+; WOPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
+; WOPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir b/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
new file mode 100644
index 0000000000000..768acf35eeae3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir
@@ -0,0 +1,227 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: cluster_add_addc
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN: dead %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+# GCN: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec
+name: cluster_add_addc
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_64 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sreg_64 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+    %6 = V_MOV_B32_e32 0, implicit %exec
+    %7 = V_MOV_B32_e32 0, implicit %exec
+    S_NOP 0, implicit def %vcc
+    %4, %5 = V_ADDC_U32_e64 %6, %7, %3, implicit %exec
+...
+
+# GCN-LABEL: name: interleave_add64s
+# GCN: dead %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %12, dead %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec
+# GCN-NEXT: dead %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec
+# GCN-NEXT: dead %14, dead %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec
+name: interleave_add64s
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: vgpr_32 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: vgpr_32 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: vgpr_32 }
+  - { id: 11, class: sreg_64 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: sreg_64 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: sreg_64 }
+
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    %2 = V_MOV_B32_e32 0, implicit %exec
+    %3 = V_MOV_B32_e32 0, implicit %exec
+    %4 = V_MOV_B32_e32 0, implicit %exec
+    %5 = V_MOV_B32_e32 0, implicit %exec
+    %6 = V_MOV_B32_e32 0, implicit %exec
+    %7 = V_MOV_B32_e32 0, implicit %exec
+
+    %8, %9 = V_ADD_I32_e64 %0, %1, implicit %exec
+    %10, %11 = V_ADD_I32_e64 %2, %3, implicit %exec
+
+
+    %12, %13 = V_ADDC_U32_e64 %4, %5, %9, implicit %exec
+    %14, %15 = V_ADDC_U32_e64 %6, %7, %11, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_mov_addc
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN-NEXT: %2 = S_MOV_B64 0
+# GCN-NEXT: dead %3, dead %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec
name: cluster_mov_addc
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: sreg_64 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    %2 = S_MOV_B64 0
+    S_NOP 0, implicit def %vcc
+    %3, %4 = V_ADDC_U32_e64 %0, %1, %2, implicit %exec
+...
+
+# GCN-LABEL: name: no_cluster_add_addc_diff_sgpr
+# GCN: dead %2, dead %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: %6 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: %7 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: S_NOP 0, implicit-def %vcc
+# GCN-NEXT: %8 = S_MOV_B64 0
+# GCN-NEXT: dead %4, dead %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec
+name: no_cluster_add_addc_diff_sgpr
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_64 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sreg_64 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: sreg_64 }
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    %8 = S_MOV_B64 0
+    %2, %3 = V_ADD_I32_e64 %0, %1, implicit %exec
+    %6 = V_MOV_B32_e32 0, implicit %exec
+    %7 = V_MOV_B32_e32 0, implicit %exec
+    S_NOP 0, implicit def %vcc
+    %4, %5 = V_ADDC_U32_e64 %6, %7, %8, implicit %exec
+...
+# GCN-LABEL: name: cluster_sub_subb
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN: dead %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec
+# GCN: dead %4, dead %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec
+name: cluster_sub_subb
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_64 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sreg_64 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    %2, %3 = V_SUB_I32_e64 %0, %1, implicit %exec
+    %6 = V_MOV_B32_e32 0, implicit %exec
+    %7 = V_MOV_B32_e32 0, implicit %exec
+    S_NOP 0, implicit def %vcc
+    %4, %5 = V_SUBB_U32_e64 %6, %7, %3, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_cmp_cndmask
+# GCN: S_NOP 0, implicit-def %vcc
+# GCN-NEXT: %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec
+name: cluster_cmp_cndmask
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_64 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sreg_64 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    %3 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+    S_NOP 0, implicit def %vcc
+    %4 = V_CNDMASK_B32_e64 %0, %1, %3, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_multi_use_cmp_cndmask
+# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+name: cluster_multi_use_cmp_cndmask
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: vgpr_32 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    %2 = V_MOV_B32_e32 0, implicit %exec
+    %3 = V_MOV_B32_e32 0, implicit %exec
+
+    %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+    S_NOP 0, implicit def %vcc
+    %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+    %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+...
+
+# GCN-LABEL: name: cluster_multi_use_cmp_cndmask2
+# GCN: %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+# GCN-NEXT: dead %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+# GCN-NEXT: %3 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: dead %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+name: cluster_multi_use_cmp_cndmask2
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: vgpr_32 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    %4 = V_CMP_EQ_I32_e64 %0, %1, implicit %exec
+    %2 = V_MOV_B32_e32 0, implicit %exec
+    %5 = V_CNDMASK_B32_e64 %2, %1, %4, implicit %exec
+    %3 = V_MOV_B32_e32 0, implicit %exec
+    %6 = V_CNDMASK_B32_e64 %1, %3, %4, implicit %exec
+...
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index b855fc500c6b4..8a6bf853a7c6a 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -19,15 +19,15 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
@@ -55,15 +55,15 @@ define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
-; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
-; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]
+; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
+; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -99,11 +99,11 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
@@ -133,8 +133,8 @@ define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -167,9 +167,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -205,8 +205,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* no
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -238,9 +238,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -278,7 +278,7 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* no
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -313,8 +313,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -355,9 +355,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
@@ -395,13 +395,13 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
-; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
+; SI-DENORM: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP1]], [[C]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
@@ -437,13 +437,13 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
-; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-DENORM: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
@@ -479,21 +479,21 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[A]]
-; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP0]]
+; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
+; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
-; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[B]], [[A]]
+; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]
-; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
@@ -530,21 +530,21 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
-; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[C]], [[B]]
-; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[A]]
+; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
+; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]
; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
-; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
-; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
+; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]
; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 8e0014911def8..77c35fac8b5de 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -34,8 +34,8 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]]
-; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
+; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]]
+; GCN-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
; GCN: s_endpgm
define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -199,7 +199,7 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia
; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GCN: buffer_load_dword [[VGPR:v[0-9]+]]
; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
bb:
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index 6bc40e82459bb..b78d65ae1e1a1 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -32,8 +32,8 @@ define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float add
; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]]
-; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
+; GCN-DAG: v_mac_f32_e32 [[VB]], [[VA]], [[VK]]
+; GCN-DAG: v_mac_f32_e32 [[VC]], [[VA]], [[VK]]
; GCN: s_endpgm
define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll
index ffcdac03bc74c..6387c9ff6dfaf 100644
--- a/test/CodeGen/AMDGPU/max.ll
+++ b/test/CodeGen/AMDGPU/max.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll
index dfd5b97fcc865..6b0ec483247cf 100644
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index b23b21118aaa3..97666492e376f 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
declare i32 @llvm.amdgcn.workitem.id.x() readnone
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index 57c50c9804e56..a0290789175d3 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
; mul24 and mad24 are affected
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 82c27f204a478..ba3ff0b08bc92 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -66,9 +66,9 @@
; FIXME: Why is this compare essentially repeated?
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
+; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
; GCN: ; %Flow1
; GCN-NEXT: s_or_b64 exec, exec
diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index 8a7bf6db5b8d4..500e4cb3cc73e 100644
--- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index eb082843fb829..8e6885c4fc5e8 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}or_v2i32:
diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
index f83eb56dc6edf..776b151e30170 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) #0
declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) #0
diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
index ecb513cd80b6e..d8c7438e4d0da 100644
--- a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/regcoal-subrange-join.mir b/test/CodeGen/AMDGPU/regcoal-subrange-join.mir
new file mode 100644
index 0000000000000..bac348aaed709
--- /dev/null
+++ b/test/CodeGen/AMDGPU/regcoal-subrange-join.mir
@@ -0,0 +1,162 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck --check-prefix=GCN %s
+#
+# See bug http://llvm.org/PR33524 for details of the problem being checked here
+# This test will provoke a subrange join (see annotations below) during simple register coalescing
+# Without a fix for PR33524 this causes an unreachable in SubRange Join
+#
+# GCN-DAG: undef %[[REG0:[0-9]+]].sub0 = COPY %sgpr5
+# GCN-DAG: undef %[[REG1:[0-9]+]].sub0 = COPY %sgpr2
+# GCN-DAG: %[[REG0]].sub1 = S_MOV_B32 1
+# GCN-DAG: %[[REG1]].sub1 = S_MOV_B32 1
+
+--- |
+  define amdgpu_vs void @regcoal-subrange-join(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 %arg6) local_unnamed_addr #0 {
+    ret void
+  }
+
+...
+---
+name: regcoal-subrange-join
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: vreg_128 }
+  - { id: 2, class: vreg_128 }
+  - { id: 3, class: vreg_128 }
+  - { id: 4, class: sreg_32_xm0 }
+  - { id: 5, class: sreg_32_xm0 }
+  - { id: 6, class: sreg_32_xm0, preferred-register: '%8' }
+  - { id: 7, class: vreg_128 }
+  - { id: 8, class: sreg_32_xm0, preferred-register: '%6' }
+  - { id: 9, class: vreg_128 }
+  - { id: 10, class: sgpr_32 }
+  - { id: 11, class: sgpr_32 }
+  - { id: 12, class: sgpr_32 }
+  - { id: 13, class: sgpr_32 }
+  - { id: 14, class: sgpr_32 }
+  - { id: 15, class: sgpr_32 }
+  - { id: 16, class: vgpr_32 }
+  - { id: 17, class: sreg_32_xm0 }
+  - { id: 18, class: sreg_64 }
+  - { id: 19, class: sreg_32_xm0 }
+  - { id: 20, class: sreg_32_xm0 }
+  - { id: 21, class: sreg_64 }
+  - { id: 22, class: sreg_32_xm0_xexec }
+  - { id: 23, class: sreg_32_xm0 }
+  - { id: 24, class: sreg_64_xexec }
+  - { id: 25, class: sreg_128 }
+  - { id: 26, class: sreg_64_xexec }
+  - { id: 27, class: sreg_32_xm0_xexec }
+  - { id: 28, class: sreg_32_xm0 }
+  - { id: 29, class: vgpr_32 }
+  - { id: 30, class: vgpr_32 }
+  - { id: 31, class: vgpr_32 }
+  - { id: 32, class: vgpr_32 }
+  - { id: 33, class: vgpr_32 }
+  - { id: 34, class: vgpr_32 }
+  - { id: 35, class: vgpr_32 }
+  - { id: 36, class: vgpr_32 }
+  - { id: 37, class: vgpr_32 }
+  - { id: 38, class: sreg_128 }
+  - { id: 39, class: sreg_64_xexec }
+  - { id: 40, class: sreg_32_xm0_xexec }
+  - { id: 41, class: sreg_32_xm0 }
+  - { id: 42, class: vgpr_32 }
+  - { id: 43, class: vgpr_32 }
+  - { id: 44, class: vgpr_32 }
+  - { id: 45, class: vgpr_32 }
+  - { id: 46, class: vgpr_32 }
+  - { id: 47, class: vgpr_32 }
+  - { id: 48, class: vgpr_32 }
+  - { id: 49, class: vgpr_32 }
+  - { id: 50, class: vgpr_32 }
+  - { id: 51, class: sreg_128 }
+  - { id: 52, class: vgpr_32 }
+  - { id: 53, class: vgpr_32 }
+  - { id: 54, class: vgpr_32 }
+  - { id: 55, class: vgpr_32 }
+  - { id: 56, class: vreg_128 }
+  - { id: 57, class: vreg_128 }
+  - { id: 58, class: vreg_128 }
+  - { id: 59, class: sreg_32_xm0 }
+  - { id: 60, class: sreg_32_xm0 }
+  - { id: 61, class: vreg_128 }
+liveins:
+  - { reg: '%sgpr2', virtual-reg: '%12' }
+  - { reg: '%sgpr5', virtual-reg: '%15' }
+body: |
+  bb.0:
+    liveins: %sgpr2, %sgpr5
+
+    %15 = COPY killed %sgpr5
+    %12 = COPY killed %sgpr2
+    %17 = S_MOV_B32 1
+    undef %18.sub1 = COPY %17
+    %0 = COPY %18
+    %0.sub0 = COPY killed %12
+    %21 = COPY killed %18
+    %21.sub0 = COPY killed %15
+    %22 = S_LOAD_DWORD_IMM killed %21, 2, 0
+    %23 = S_MOV_B32 491436
+    undef %24.sub0 = COPY killed %22
+    %24.sub1 = COPY killed %23
+    %25 = S_LOAD_DWORDX4_IMM killed %24, 0, 0
+    %1 = COPY killed %25
+    %26 = S_LOAD_DWORDX2_IMM %0, 2, 0
+    dead %27 = S_LOAD_DWORD_IMM killed %26, 0, 0
+    S_CBRANCH_SCC0 %bb.1, implicit undef %scc
+
+  bb.5:
+    %58 = COPY killed %1
+    %59 = COPY killed %17
+    S_BRANCH %bb.2
+
+  bb.1:
+    %30 = V_MOV_B32_e32 1036831949, implicit %exec
+    %31 = V_ADD_F32_e32 %30, %1.sub3, implicit %exec
+    %33 = V_ADD_F32_e32 %30, %1.sub2, implicit %exec
+    %35 = V_ADD_F32_e32 %30, %1.sub1, implicit %exec
+    %37 = V_ADD_F32_e32 killed %30, killed %1.sub0, implicit %exec
+    undef %56.sub0 = COPY killed %37
+    %56.sub1 = COPY killed %35
+    %56.sub2 = COPY killed %33
+    %56.sub3 = COPY killed %31
+    %28 = S_MOV_B32 0
+    %2 = COPY killed %56
+    %58 = COPY killed %2
+    %59 = COPY killed %28
+
+  bb.2:
+    %4 = COPY killed %59
+    %3 = COPY killed %58
+    %39 = S_LOAD_DWORDX2_IMM killed %0, 6, 0
+    %40 = S_LOAD_DWORD_IMM killed %39, 0, 0
+    %43 = V_MOV_B32_e32 -1102263091, implicit %exec
+    %60 = COPY killed %4
+    %61 = COPY killed %3
+
+  bb.3:
+    successors: %bb.3, %bb.4
+
+    %7 = COPY killed %61
+    %6 = COPY killed %60
+    %8 = S_ADD_I32 killed %6, 1, implicit-def dead %scc
+    %44 = V_ADD_F32_e32 %43, %7.sub3, implicit %exec
+    %46 = V_ADD_F32_e32 %43, %7.sub2, implicit %exec
+    %48 = V_ADD_F32_e32 %43, %7.sub1, implicit %exec
+    %50 = V_ADD_F32_e32 %43, killed %7.sub0, implicit %exec
+    undef %57.sub0 = COPY killed %50
+    %57.sub1 = COPY killed %48
+    %57.sub2 = COPY %46
+    %57.sub3 = COPY killed %44
+    S_CMP_LT_I32 %8, %40, implicit-def %scc
+    %60 = COPY killed %8
+    %61 = COPY killed %57
+    S_CBRANCH_SCC1 %bb.3, implicit killed %scc
+    S_BRANCH %bb.4
+
+  bb.4:
+    EXP 32, undef %53, undef %54, killed %46, undef %55, 0, 0, 15, implicit %exec
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
index ff4069226a62b..260b32ed3406c 100644
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
; SI: buffer_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll
index 266490718dd18..fa29d789cebee 100644
--- a/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
; BOTH-LABEL: {{^}}s_rotl_i64:
; BOTH-DAG: s_lshl_b64
diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll
index 9eda479cd25c2..af58b404ca6c6 100644
--- a/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
; BOTH-LABEL: {{^}}s_rotr_i64:
; BOTH-DAG: s_sub_i32
diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll
index 9462683efe0e8..204eeb9983868 100644
--- a/test/CodeGen/AMDGPU/rsq.ll
+++ b/test/CodeGen/AMDGPU/rsq.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.sqrt.f32(float) nounwind readnone
@@ -48,8 +48,8 @@ define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float
; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]]
; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]]
-; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]]
-; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RSQA]], [[RCPB]]
+; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
; SI-UNSAFE: buffer_store_dword [[RESULT]]
; SI-SAFE-NOT: v_rsq_f32
diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll
index a131aaa3dfb4f..797fbc2712b0f 100644
--- a/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; SI-LABEL: {{^}}s_movk_i32_k0:
; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
diff --git a/test/CodeGen/AMDGPU/sad.ll b/test/CodeGen/AMDGPU/sad.ll
index f7a1c65881d02..ee56e9053fd3f 100644
--- a/test/CodeGen/AMDGPU/sad.ll
+++ b/test/CodeGen/AMDGPU/sad.ll
@@ -134,8 +134,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out,
; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll
index 586a455b2b91e..09e87d524419e 100644
--- a/test/CodeGen/AMDGPU/saddo.ll
+++ b/test/CodeGen/AMDGPU/saddo.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index 6e1dd16383337..d5b2fa0b67540 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 62d0d93678858..0f09fa17423e6 100644
--- a/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; XXX - Why the packing?
; GCN-LABEL: {{^}}scalar_to_vector_v2i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
-; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
+; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHR]], [[SHL]]
; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
index 44d46086f02af..2dddba8bccc76 100644
--- a/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
; FIXME: This currently doesn't do a great job of clustering the
; loads, which end up with extra moves between them. Right now, it
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
index 6b1e85915a110..4ae9871865f5e 100644
--- a/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; When a frame index offset is more than 12-bits, make sure we don't store
; it in mubuf's offset field.
diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll
index abd15f1fb47f8..6ed730ad60f42 100644
--- a/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -12,10 +12,8 @@
; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
-; GCN-DAG: v_mov_b32_e32 [[C200:v[0-9]+]], 0x200
-; GCN-DAG: v_mov_b32_e32 [[C400:v[0-9]+]], 0x400
-; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[C200]], [[CLAMP_IDX]]
-; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[C400]], [[CLAMP_IDX]]
+; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
+; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index 7ec6ca809b685..305107f690fb8 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; The code generated by sdiv is long and complex and may frequently change.
; The goal of this test is to make sure the ISel doesn't fail.
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 0dc7cc309f7c9..0d181c2c34b85 100644
--- a/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
; GCN-LABEL: {{^}}add_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
@@ -35,7 +35,7 @@ define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
; GCN-LABEL: {{^}}mul_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST0]], v[[DST1]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -68,9 +68,9 @@ entry:
; GCN-LABEL: {{^}}mul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
@@ -168,14 +168,14 @@ entry:
; GCN-LABEL: {{^}}mul_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_f16_sdwa
; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
+; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]]
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -362,9 +362,9 @@ entry:
; GCN-LABEL: {{^}}mac_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
-; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mac_f16_sdwa
; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -491,7 +491,7 @@ entry:
%tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-
+
%arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
ret void
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index 3417eb02b3614..e0619251f9204 100644
--- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -103,7 +103,7 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -122,7 +122,7 @@ define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -154,7 +154,7 @@ define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -171,7 +171,7 @@ define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -191,7 +191,7 @@ define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -245,7 +245,7 @@ define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[Z:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -266,8 +266,8 @@ define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[W:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[X]]
define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -291,7 +291,7 @@ define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
; GCN-DAG: v_xor_b32_e32 [[NEG_X:v[0-9]+]], 0x80000000, [[X]]
; GCN-DAG: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[SELECT]], [[Z]]
+; GCN-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[Z]], [[SELECT]]
; GCN: buffer_store_dword [[ADD]]
; GCN: buffer_store_dword [[NEG_X]]
@@ -316,8 +316,8 @@ define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c)
; GCN: buffer_load_dword [[W:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN-DAG: v_sub_f32_e32 v{{[0-9]+}}, [[W]], [[Y]]
define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -341,7 +341,7 @@ define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -359,7 +359,7 @@ define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -377,7 +377,7 @@ define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -397,7 +397,7 @@ define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -414,7 +414,7 @@ define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
; GCN: v_cmp_eq_u32_e64
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -431,7 +431,7 @@ define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
; GCN: v_cmp_eq_u32_e64
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -445,7 +445,7 @@ define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%cmp = icmp eq i32 %c, 0
@@ -462,7 +462,7 @@ define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -479,7 +479,7 @@ define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
; GCN: buffer_load_dword [[Y:v[0-9]+]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -497,7 +497,7 @@ define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -517,7 +517,7 @@ define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
; GCN-DAG: v_or_b32_e32 [[X_NEG_ABS:v[0-9]+]], 0x80000000, [[X]]
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -540,7 +540,7 @@ define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_or_b32_e32 [[Y_NEG_ABS:v[0-9]+]], 0x80000000, [[Y]]
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -563,7 +563,7 @@ define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
; GCN-DAG: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -585,7 +585,7 @@ define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc
-; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -606,7 +606,7 @@ define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
@@ -628,7 +628,7 @@ define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc
-; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
%x = load volatile float, float addrspace(1)* undef
%y = load volatile float, float addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index ebbc675b2babe..b77ebcf5bf529 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; Test expansion of scalar selects on vectors.
; Evergreen not enabled since it seems to be having problems with doubles.
diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll
index 92ee2eb7f403f..e79ce3af0cf9d 100644
--- a/test/CodeGen/AMDGPU/select.f16.ll
+++ b/test/CodeGen/AMDGPU/select.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -104,8 +104,8 @@ entry:
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
+; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
@@ -134,8 +134,8 @@ entry:
; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
+; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
@@ -159,16 +159,16 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e64
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e64
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
-; VI: v_cmp_lt_f16_e64
; VI: v_cmp_lt_f16_e32
-; VI: v_cndmask_b32_e64
+; VI: v_cndmask_b32_e32
+; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; GCN: s_endpgm
@@ -196,13 +196,17 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI-DAG: v_cmp_gt_f32_e64
-; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5
-; VI: v_cmp_lt_f16_e32
-; VI: v_cmp_gt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
+; SI: v_cmp_lt_f32_e32 vcc, 0.5
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_gt_f32_e32
+; SI: v_cndmask_b32_e32
+
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32_e32
+; VI: v_cmp_gt_f16_e32
+; VI: v_cndmask_b32_e32
+
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
@@ -228,13 +232,16 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI-DAG: v_cmp_lt_f32_e64
-; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5
-; VI: v_cmp_gt_f16_e32
-; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
+; SI: v_cmp_gt_f32_e32 vcc, 0.5
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32_e32
+
+; VI: v_cmp_gt_f16_e32
+; VI: v_cndmask_b32_e32
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
@@ -263,8 +270,8 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_nlt_f32_e32
-; SI: v_cmp_nlt_f32_e64
-; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_nlt_f32_e32
; SI: v_cndmask_b32_e32
; VI: v_cmp_nlt_f16_e32
@@ -298,13 +305,17 @@ entry:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e64
+
; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32
; VI: v_cmp_lt_f16_e32
-; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32
-; GCN: v_cndmask_b32
+; VI: v_cndmask_b32
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32
+
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
index 8d455d84bf9e7..bcaa1aa54c15f 100644
--- a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
+++ b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
@@ -7,7 +7,7 @@
; GCN: buffer_load_dword [[B:v[0-9]+]]
; GCN: buffer_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @multi_use_fneg_src() #0 {
@@ -30,7 +30,7 @@ define amdgpu_kernel void @multi_use_fneg_src() #0 {
; GCN: buffer_load_dword [[B:v[0-9]+]]
; GCN: buffer_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]]
; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]]
define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
@@ -78,7 +78,7 @@ define amdgpu_kernel void @multi_use_fneg() #0 {
; GCN: buffer_load_dword [[A:v[0-9]+]]
; GCN: buffer_load_dword [[B:v[0-9]+]]
-; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]]
; GCN: buffer_store_dword [[MUL1]]
diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll
index f63719d62a847..a3bf167e756af 100644
--- a/test/CodeGen/AMDGPU/setcc.ll
+++ b/test/CodeGen/AMDGPU/setcc.ll
@@ -7,8 +7,8 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
-; GCN-DAG: v_cmp_eq_u32_e32
-; GCN-DAG: v_cmp_eq_u32_e64
+; GCN: v_cmp_eq_u32_e32
+; GCN: v_cmp_eq_u32_e32
define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
%result = icmp eq <2 x i32> %a, %b
%sext = sext <2 x i1> %result to <2 x i32>
@@ -23,9 +23,9 @@ define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; GCN: v_cmp_eq_u32_e32
-; GCN: v_cmp_eq_u32_e64
-; GCN: v_cmp_eq_u32_e64
-; GCN: v_cmp_eq_u32_e64
+; GCN: v_cmp_eq_u32_e32
+; GCN: v_cmp_eq_u32_e32
+; GCN: v_cmp_eq_u32_e32
define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 160fb6a038fed..5b4d9ed259b60 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FIXME: i16 promotion pass ruins the scalar cases when legal.
; FIXME: r600 fails verifier
diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
index fb0bbaa9cbf27..8250bad7b0a10 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
; Copy VGPR -> SGPR used twice as an instruction operand, which is then
; used in an REG_SEQUENCE that also needs to be handled.
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
index 931051102cd5c..3b24cf82d783b 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}phi1:
; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index 4f7b61adc91d5..2f9eed457ab6d 100644
--- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Extract the high bit of the 1st quarter
; GCN-LABEL: {{^}}v_uextract_bit_31_i128:
@@ -98,7 +98,7 @@ define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128
; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}}
; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
+; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[ELT1PART]], v[[SHLLO]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index c70eb9b9c4a53..670287ba79373 100644
--- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll
index 5306e190a4f9c..f3faa39c64e68 100644
--- a/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index edc313ee323bd..13ac9140b8273 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
index 6248d8a46daf6..767118eb8d118 100644
--- a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
+++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -6,92 +6,7 @@
# that the post-RA run does manage to shrink it, but right now the
# resume crashes
---- |
- define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = sub i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = sub i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- %tid.ext = sext i32 %tid to i64
- %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
- %b.ptr = getelementptr i32, i32 addrspace(1)* %a.ptr, i32 1
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 %tid.ext
- %a = load volatile i32, i32 addrspace(1)* %a.ptr
- %b = load volatile i32, i32 addrspace(1)* %b.ptr
- %result = add i32 %a, %b
- store volatile i32 %result, i32 addrspace(1)* %out.gep
- ret void
- }
-
- declare i32 @llvm.amdgcn.workitem.id.x() #1
-
- attributes #0 = { nounwind }
- attributes #1 = { nounwind readnone }
-
...
----
# GCN-LABEL: name: shrink_add_vop3{{$}}
# GCN: %29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
@@ -151,13 +66,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -166,11 +81,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %9 = V_ADD_I32_e64 %19, %17, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -235,13 +150,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -250,11 +165,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %9 = V_SUB_I32_e64 %19, %17, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -319,13 +234,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -334,11 +249,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %9 = V_SUBREV_I32_e64 %19, %17, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %9, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
@@ -402,13 +317,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -417,18 +332,18 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%9 = S_MOV_B64 0
%29, %vcc = V_ADDC_U32_e64 %19, %17, %9, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
---
# GCN-LABEL: name: shrink_addc_vop3{{$}}
-# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit %vcc, implicit %exec
+# GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit %vcc, implicit %exec
# GCN %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
name: shrink_addc_vop3
@@ -487,13 +402,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -502,19 +417,19 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%vcc = S_MOV_B64 0
%29, %vcc = V_ADDC_U32_e64 %19, %17, %vcc, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
---
# GCN-LABEL: name: shrink_addc_undef_vcc{{$}}
-# GCN: %29 = V_ADDC_U32_e32 %17, %19, implicit-def %vcc, implicit undef %vcc, implicit %exec
+# GCN: %29 = V_ADDC_U32_e32 %19, %17, implicit-def %vcc, implicit undef %vcc, implicit %exec
# GCN: %24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
name: shrink_addc_undef_vcc
alignment: 0
@@ -572,13 +487,13 @@ frameInfo:
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
- bb.0 (%ir-block.0):
+ bb.0:
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
- %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
- %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0
%26 = V_ASHRREV_I32_e32 31, %3, implicit %exec
%27 = REG_SEQUENCE %3, 1, %26, 2
%10 = S_MOV_B32 61440
@@ -587,11 +502,11 @@ body: |
%13 = REG_SEQUENCE killed %5, 17, %12, 18
%28 = V_LSHL_B64 killed %27, 2, implicit %exec
%16 = REG_SEQUENCE killed %4, 17, %12, 18
- %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.a.ptr)
- %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec :: (volatile load 4 from %ir.b.ptr)
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, implicit %exec
+ %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, implicit %exec
%29, %vcc = V_ADDC_U32_e64 %19, %17, undef %vcc, implicit %exec
%24 = V_CNDMASK_B32_e64 0, 1, killed %vcc, implicit %exec
- BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out.gep)
+ BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, implicit %exec
S_ENDPGM
...
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 348c7200c0bc1..17109187d5387 100644
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll
index 3e452c214e983..c80945f390bed 100644
--- a/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}s_sext_i1_to_i32:
; GCN: v_cndmask_b32_e64
diff --git a/test/CodeGen/AMDGPU/sitofp.f16.ll b/test/CodeGen/AMDGPU/sitofp.f16.ll
index 574d1c0b2c78e..0bcef99df39f6 100644
--- a/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}sitofp_i16_to_f16
; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll
index 827d672022eba..41430715f3476 100644
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}s_abs_i32:
; GCN: s_abs_i32
@@ -18,7 +18,7 @@ define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind
; FUNC-LABEL: {{^}}v_abs_i32:
; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]]
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[SRC]], [[NEG]]
; GCN: v_add_i32
; EG: MAX_INT
@@ -34,7 +34,7 @@ define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
; GCN-LABEL: {{^}}v_abs_i32_repeat_user:
; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
-; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]]
+; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[SRC]], [[NEG]]
; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]]
define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
%val = load i32, i32 addrspace(1)* %src, align 4
@@ -71,8 +71,8 @@ define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
; GCN: v_add_i32
; GCN: v_add_i32
@@ -132,10 +132,10 @@ define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC2]], [[NEG2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC3]], [[NEG3]]
; GCN: v_add_i32
; GCN: v_add_i32
@@ -184,8 +184,8 @@ define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(
; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
-; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
+; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
+; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL0]], [[VAL1]]
define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
%val0 = load volatile i32, i32 addrspace(1)* %ptr0
%val1 = load volatile i32, i32 addrspace(1)* %ptr1
diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index a9aac2d8abb75..27263429650d8 100644
--- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn
-mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s ; GCN-LABEL: {{^}}s_abs_v2i16: ; GFX9: s_load_dword [[VAL:s[0-9]+]] diff --git a/test/CodeGen/AMDGPU/spill-cfg-position.ll b/test/CodeGen/AMDGPU/spill-cfg-position.ll index 1ca0919258a8e..cbf9f37e29ef7 100644 --- a/test/CodeGen/AMDGPU/spill-cfg-position.ll +++ b/test/CodeGen/AMDGPU/spill-cfg-position.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -stress-regalloc=6 < %s | FileCheck %s ; Inline spiller can decide to move a spill as early as possible in the basic block. ; It will skip phis and label, but we also need to make sure it skips instructions diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index 44cfdf6398aef..74618b263bad7 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll index e067258920892..51eaf9a960b00 100644 --- a/test/CodeGen/AMDGPU/srem.ll +++ b/test/CodeGen/AMDGPU/srem.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll index 
cb40ecf2de1ca..8878b45385556 100644 --- a/test/CodeGen/AMDGPU/srl.ll +++ b/test/CodeGen/AMDGPU/srl.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll index 135632343f909..d65c2adc7e202 100644 --- a/test/CodeGen/AMDGPU/ssubo.ll +++ b/test/CodeGen/AMDGPU/ssubo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll index 1d407ea9bcda6..14bedceed6eee 100644 --- a/test/CodeGen/AMDGPU/sub.i16.ll +++ b/test/CodeGen/AMDGPU/sub.i16.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}v_test_sub_i16: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: buffer_store_short [[ADD]] define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -68,7 +68,7 @@ define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i32: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: buffer_store_dword [[ADD]] define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -88,7 +88,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] +; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, 
i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -107,7 +107,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i1 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { @@ -127,7 +127,7 @@ define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i1 ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll index e7655df155204..46f1b120f2127 100644 --- a/test/CodeGen/AMDGPU/sub.ll +++ b/test/CodeGen/AMDGPU/sub.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() readnone diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll index ee923e2b8b611..8d5c8b64efb83 100644 --- a/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -6,7 +6,7 @@ ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -165,10 +165,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace( ; VI: flat_load_ushort v[[B_HI:[0-9]+]] ; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: v_subrev_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] +; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; VI-NOT: and ; VI-NOT: shl -; VI: v_subrev_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} @@ -201,8 +201,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; VI: flat_load_ushort v[[B_LO:[0-9]+]] ; VI: flat_load_ushort v[[B_HI:[0-9]+]] -; VI-DAG: v_subrev_u16_e32 -; VI-DAG: v_subrev_u16_e32 +; VI: v_sub_u16_e32 +; VI: v_sub_u16_e32 ; VI: 
buffer_store_dwordx4 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { @@ -228,8 +228,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} -; VI: v_subrev_u16_e32 -; VI: v_subrev_u16_e32 +; VI: v_sub_u16_e32 +; VI: v_sub_u16_e32 ; VI: buffer_store_dwordx2 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_subrev_u16_e32 +; VI: v_sub_u16_e32 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 diff --git a/test/CodeGen/AMDGPU/syncscopes.ll b/test/CodeGen/AMDGPU/syncscopes.ll new file mode 100644 index 0000000000000..3741ce788993e --- /dev/null +++ b/test/CodeGen/AMDGPU/syncscopes.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-before=si-debugger-insert-nops < %s | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: name: syncscopes +; GCN: FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out) +; GCN: FLAT_STORE_DWORD killed %vgpr4_vgpr5, killed %vgpr3, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) +; GCN: FLAT_STORE_DWORD killed %vgpr7_vgpr8, killed %vgpr6, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) +define void @syncscopes( + i32 %agent, + i32 addrspace(4)* %agent_out, + i32 %workgroup, + i32 addrspace(4)* %workgroup_out, + i32 %wavefront, + i32 addrspace(4)* %wavefront_out) { +entry: + store atomic i32 %agent, i32 addrspace(4)* %agent_out syncscope("agent") seq_cst, align 4 + store atomic i32 %workgroup, i32 addrspace(4)* %workgroup_out syncscope("workgroup") seq_cst, align 4 + store atomic i32 %wavefront, i32 addrspace(4)* %wavefront_out syncscope("wavefront") seq_cst, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll index f90040385f753..77a6820713d6d 100644 --- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32: ; CHECK: buffer_load_dword v diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll index 0c91d52df0c08..da038f4b05972 100644 --- a/test/CodeGen/AMDGPU/trunc.ll +++ 
b/test/CodeGen/AMDGPU/trunc.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s declare i32 @llvm.r600.read.tidig.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll index 632ccaa7e6124..5754bd9bb913a 100644 --- a/test/CodeGen/AMDGPU/uaddo.ll +++ b/test/CodeGen/AMDGPU/uaddo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}s_uaddo_i64_zext: ; GCN: s_add_u32 @@ -58,8 +58,8 @@ define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}v_uaddo_i32_novcc: -; GCN: v_add_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]] +; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG: ADDC_UINT ; EG: ADD_INT diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll index d9dab0d40acf6..1d683776bfd5a 100644 --- a/test/CodeGen/AMDGPU/udiv.ll +++ b/test/CodeGen/AMDGPU/udiv.ll @@ -1,9 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG 
-check-prefix=FUNC %s ; FUNC-LABEL: {{^}}udiv_i32: ; EG-NOT: SETGE_INT diff --git a/test/CodeGen/AMDGPU/uitofp.f16.ll b/test/CodeGen/AMDGPU/uitofp.f16.ll index 0c3b0fcaf8549..eaa1d073cafb4 100644 --- a/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}uitofp_i16_to_f16 ; GCN: buffer_load_ushort v[[A_I16:[0-9]+]] diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll index fb4eab43a2d66..823c918dcda70 100644 --- a/test/CodeGen/AMDGPU/urem.ll +++ b/test/CodeGen/AMDGPU/urem.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; The code generated by urem is long and complex and may frequently ; change. 
The goal of this test is to make sure the ISel doesn't fail diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll index d1f454f0bc655..f01bf498e0d8a 100644 --- a/test/CodeGen/AMDGPU/usubo.ll +++ b/test/CodeGen/AMDGPU/usubo.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s ; FUNC-LABEL: {{^}}s_usubo_i64_zext: ; GCN: s_sub_u32 @@ -58,8 +58,8 @@ define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}v_usubo_i32_novcc: -; GCN: v_sub_i32_e64 v{{[0-9]+}}, [[COND:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[COND]] +; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT @@ -120,7 +120,7 @@ define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}v_usubo_i16: -; VI: v_subrev_u16_e32 +; VI: v_sub_u16_e32 ; VI: v_cmp_gt_u16_e32 define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll index d4a68a418ee41..5cbfae34e1bb5 100644 --- a/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/test/CodeGen/AMDGPU/v_cndmask.ll @@ -200,9 +200,9 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* % ; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc ; SI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc -; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} -; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s -; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s +; VI-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} +; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc +; VI-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -292,10 +292,10 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrs ; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1: ; GCN: load_dword ; GCN: load_ubyte -; GCN-DAG: v_cmp_gt_i32_e64 s{{\[[0-9]+:[0-9]+\]}}, 0, v +; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v ; DCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1, -; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1, v -; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc +; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v +; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}} ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s ; GCN: store_byte define 
amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 { diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll index 2b96f7d50076a..da57155f33ef1 100644 --- a/test/CodeGen/AMDGPU/v_mac.ll +++ b/test/CodeGen/AMDGPU/v_mac.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s ; GCN-LABEL: {{^}}mac_vvv: ; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} ; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 -; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]] +; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]] ; GCN: buffer_store_dword [[C]] define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: @@ -135,7 +135,7 @@ entry: ; GCN-LABEL: {{^}}safe_mad_sub0_src0: ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0, -; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]] +; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}} define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll index ce4a69db35060..46c9b7ee1a3d5 100644 --- a/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}mac_f16: ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] @@ -8,10 +8,10 @@ ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[B_F32]], v[[A_F32]] +; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] ; SI: 
buffer_store_short v[[R_F16]] -; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]] +; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] ; VI: buffer_store_short v[[C_F16]] ; GCN: s_endpgm define amdgpu_kernel void @mac_f16( @@ -147,9 +147,9 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math: ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] +; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] +; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math( half addrspace(1)* %r, @@ -171,9 +171,9 @@ entry: ; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math: ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} +; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} +; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math( half addrspace(1)* %r, @@ -312,20 +312,20 @@ entry: ; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]] +; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; VI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] ; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] ; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]] ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -481,14 +481,14 @@ entry: ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} +; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} ; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} ; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] +; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, 
v[[NEG_A1]], v{{[0-9]+}}
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
@@ -513,14 +513,14 @@ entry:
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
-; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
+; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
diff --git a/test/CodeGen/AMDGPU/vectorize-global-local.ll b/test/CodeGen/AMDGPU/vectorize-global-local.ll
index 90cf34e609f6e..381ff5b1b518a 100644
--- a/test/CodeGen/AMDGPU/vectorize-global-local.ll
+++ b/test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; CHECK-DAG: flat_load_dwordx4
; CHECK-DAG: flat_load_dwordx4
; CHECK-DAG: flat_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir b/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir
new file mode 100644
index 0000000000000..f8a2339626cf1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vop-shrink-frame-index.mir
@@ -0,0 +1,161 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+--- |
+
+  define amdgpu_kernel void @fold_fi_vgpr() {
+    %alloca = alloca [4 x i32]
+    ret void
+  }
+
+  define amdgpu_kernel void @fold_vgpr_fi() {
+    %alloca = alloca [4 x i32]
+    ret void
+  }
+
+  define amdgpu_kernel void @fold_sgpr_fi() {
+    %alloca = alloca [4 x i32]
+    ret void
+  }
+
+  define amdgpu_kernel void @fold_fi_sgpr() {
+    %alloca = alloca [4 x i32]
+    ret void
+  }
+
+  define amdgpu_kernel void @fold_fi_imm() {
+    %alloca = alloca [4 x i32]
+    ret void
+  }
+
+  define amdgpu_kernel void @fold_imm_fi() {
+    %alloca = alloca [4 x i32]
+    ret void
+  }
+
+...
+# GCN-LABEL: name: fold_fi_vgpr{{$}}
+# GCN: %1 = IMPLICIT_DEF
+
+# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec
+name: fold_fi_vgpr
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+      callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+      di-location: '' }
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+    %1 = IMPLICIT_DEF
+    %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+    S_ENDPGM
+
+...
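# The cases above and below check the same rewrite; a minimal before/after
# sketch of it (illustrative, assuming the frame index has to end up as src0 of
# the VOP2 form, so operands are commuted when the FI arrives in src1):
#   %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
#   %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
#   -->
#   %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec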
+# GCN-LABEL: name: fold_vgpr_fi{{$}}
+# GCN: %1 = IMPLICIT_DEF
+# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec
+name: fold_vgpr_fi
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+      callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+      di-location: '' }
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+    %1 = IMPLICIT_DEF
+    %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec
+    S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_sgpr_fi{{$}}
+# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+# GCN: %1 = IMPLICIT_DEF
+# GCN: %2 = V_ADD_I32_e32 %1, %0, implicit-def %vcc, implicit %exec
+name: fold_sgpr_fi
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: sgpr_32 }
+  - { id: 2, class: vgpr_32 }
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+      callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+      di-location: '' }
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+    %1 = IMPLICIT_DEF
+    %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec
+    S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_fi_sgpr{{$}}
+# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+# GCN: %1 = IMPLICIT_DEF
+# GCN: %2 = V_ADD_I32_e32 %1, %0, implicit-def %vcc, implicit %exec
+name: fold_fi_sgpr
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: sgpr_32 }
+  - { id: 2, class: vgpr_32 }
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+      callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+      di-location: '' }
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+    %1 = IMPLICIT_DEF
+    %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+    S_ENDPGM
+...
+# TODO: Should probably prefer folding immediate first
+# GCN-LABEL: name: fold_fi_imm{{$}}
+# GCN: %1 = V_MOV_B32_e32 999, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 %stack.0.alloca, %1, implicit-def %vcc, implicit %exec
+name: fold_fi_imm
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+      callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+      di-location: '' }
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+    %1 = V_MOV_B32_e32 999, implicit %exec
+    %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+    S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_imm_fi{{$}}
+# GCN: %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 999, %0, implicit-def %vcc, implicit %exec
+name: fold_imm_fi
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 128, alignment: 8,
+      callee-saved-register: '', local-offset: 0, di-variable: '', di-expression: '',
+      di-location: '' }
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 %stack.0.alloca, implicit %exec
+    %1 = V_MOV_B32_e32 999, implicit %exec
+    %2, %vcc = V_ADD_I32_e64 %1, %0, implicit %exec
+    S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir b/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir
new file mode 100644
index 0000000000000..b4c0c93347c20
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir
@@ -0,0 +1,40 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+...
+# GCN-LABEL: name: fold_imm_non_ssa{{$}}
+# GCN: %0 = V_MOV_B32_e32 123, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 456, %0, implicit-def %vcc, implicit %exec
+
+name: fold_imm_non_ssa
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_64 }
+body: |
+  bb.0:
+    %0 = COPY undef %0
+    %0 = V_MOV_B32_e32 123, implicit %exec
+    %1 = V_MOV_B32_e32 456, implicit %exec
+    %2, %vcc = V_ADD_I32_e64 %0, %1, implicit %exec
+    S_ENDPGM
+
+...
+# GCN-LABEL: name: fold_partially_defined_superreg{{$}}
+# GCN: %1 = V_MOV_B32_e32 456, implicit %exec
+# GCN: %2 = V_ADD_I32_e32 123, %1, implicit-def %vcc, implicit %exec
+name: fold_partially_defined_superreg
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vreg_64 }
+body: |
+  bb.0:
+    undef %3.sub0 = V_MOV_B32_e32 123, implicit %exec, implicit-def %3
+    %1 = V_MOV_B32_e32 456, implicit %exec
+    %2, %vcc = V_ADD_I32_e64 %3.sub0, %1, implicit %exec
+    S_ENDPGM
+
+...
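As a usage sketch, either of the two new MIR tests above can be run standalone by expanding its RUN line by hand (assuming llc and FileCheck from the same build are on PATH; the path below is illustrative):

  llc -march=amdgcn -verify-machineinstrs -run-pass si-shrink-instructions -o - test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir | FileCheck -check-prefix=GCN test/CodeGen/AMDGPU/vop-shrink-non-ssa.mir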
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll index bb6234729f90b..02ffd30be5fda 100644 --- a/test/CodeGen/AMDGPU/vselect.ll +++ b/test/CodeGen/AMDGPU/vselect.ll @@ -7,7 +7,9 @@ ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y -; SI: v_cndmask_b32_e64 +; SI: v_cmp_gt_i32_e32 vcc +; SI: v_cndmask_b32_e32 +; SI: v_cmp_gt_i32_e32 vcc ; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { @@ -25,8 +27,11 @@ entry: ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e32 + +; SI: v_cmp_neq_f32_e32 vcc +; SI: v_cndmask_b32_e32 +; SI: v_cmp_neq_f32_e32 vcc +; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { entry: @@ -45,12 +50,10 @@ entry: ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y -; FIXME: The shrinking does not happen on tonga - -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { entry: @@ -68,6 +71,10 @@ entry: ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { entry: %0 = load <4 x float>, <4 x float> addrspace(1)* %in0 diff --git a/test/CodeGen/AMDGPU/waitcnt-permute.mir b/test/CodeGen/AMDGPU/waitcnt-permute.mir index 44dbd38f2d300..5612c7cac00b7 100644 --- a/test/CodeGen/AMDGPU/waitcnt-permute.mir +++ b/test/CodeGen/AMDGPU/waitcnt-permute.mir @@ -1,18 +1,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s ---- | - define float @waitcnt-permute(i32 %x, i32 %y) { - entry: - %0 = call i32 @llvm.amdgcn.ds.bpermute(i32 %x, i32 %y) - %1 = bitcast i32 %0 to float - %2 = fadd float 1.000000e+00, %1 - ret float %2 - } - - declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) - ... 
---- # CHECK-LABEL: name: waitcnt-permute{{$}} # CHECK: DS_BPERMUTE_B32 # CHECK-NEXT: S_WAITCNT 127 diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll index 57a082a0170c3..847a1d7393215 100644 --- a/test/CodeGen/AMDGPU/xor.ll +++ b/test/CodeGen/AMDGPU/xor.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}xor_v2i32: @@ -60,7 +60,7 @@ define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* ; FUNC-LABEL: {{^}}v_xor_i1: ; SI: buffer_load_ubyte [[B:v[0-9]+]] ; SI: buffer_load_ubyte [[A:v[0-9]+]] -; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] +; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] ; SI: buffer_store_byte [[RESULT]] define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { diff --git a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll index a902234898cd0..69c42afb9ad5a 100644 --- a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll +++ b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll @@ -6,7 +6,7 @@ ; GCN-NOT: _or_ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]] +; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]] ; GCN-NOT: _or_ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 @@ -26,7 +26,7 @@ define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrs ; GCN-NOT: _or_ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GCN: v_or_b32_e32 v[[LO]], v[[LD32]], v[[LO]] +; GCN: v_or_b32_e32 v[[LO]], v[[LO]], v[[LD32]] ; GCN-NOT: v[[HI]] ; GCN-NOT: _or_ ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 diff --git a/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll index 9dcfe5007c006..ed5255bfbebd4 100644 --- a/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll +++ b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll @@ -6,23 +6,23 @@ ; CHECK: ** List Scheduling ; CHECK: SU(2){{.*}}STR{{.*}}Volatile -; CHECK-NOT: ord SU -; CHECK: ord SU(3): Latency=1 -; CHECK-NOT: ord SU +; CHECK-NOT: SU({{.*}}): Ord +; CHECK: SU(3): Ord Latency=1 +; CHECK-NOT: SU({{.*}}): Ord ; CHECK: SU(3){{.*}}LDR{{.*}}Volatile -; CHECK-NOT: ord SU -; CHECK: ord SU(2): Latency=1 -; CHECK-NOT: ord SU +; CHECK-NOT: SU({{.*}}): Ord +; CHECK: SU(2): Ord Latency=1 +; CHECK-NOT: SU({{.*}}): Ord ; CHECK: Successors: ; CHECK: ** List Scheduling ; CHECK: SU(2){{.*}}STR{{.*}} -; CHECK-NOT: ord SU -; CHECK: ord SU(3): Latency=1 -; CHECK-NOT: ord SU +; CHECK-NOT: SU({{.*}}): Ord +; CHECK: SU(3): Ord Latency=1 +; CHECK-NOT: SU({{.*}}): Ord ; CHECK: SU(3){{.*}}LDR{{.*}} -; CHECK-NOT: ord SU -; CHECK: ord SU(2): Latency=1 -; CHECK-NOT: ord SU +; CHECK-NOT: 
SU({{.*}}): Ord +; CHECK: SU(2): Ord Latency=1 +; CHECK-NOT: SU({{.*}}): Ord ; CHECK: Successors: define i32 @f1(i32* nocapture %p1, i32* nocapture %p2) nounwind { entry: diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir index 111375ece51ba..6c8bc7123a1ab 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select-cmp.mir @@ -10,6 +10,46 @@ define void @test_icmp_sge_s32() { ret void } define void @test_icmp_slt_s32() { ret void } define void @test_icmp_sle_s32() { ret void } + + define void @test_fcmp_true_s32() #0 { ret void } + define void @test_fcmp_false_s32() #0 { ret void } + + define void @test_fcmp_oeq_s32() #0 { ret void } + define void @test_fcmp_ogt_s32() #0 { ret void } + define void @test_fcmp_oge_s32() #0 { ret void } + define void @test_fcmp_olt_s32() #0 { ret void } + define void @test_fcmp_ole_s32() #0 { ret void } + define void @test_fcmp_ord_s32() #0 { ret void } + define void @test_fcmp_ugt_s32() #0 { ret void } + define void @test_fcmp_uge_s32() #0 { ret void } + define void @test_fcmp_ult_s32() #0 { ret void } + define void @test_fcmp_ule_s32() #0 { ret void } + define void @test_fcmp_une_s32() #0 { ret void } + define void @test_fcmp_uno_s32() #0 { ret void } + + define void @test_fcmp_one_s32() #0 { ret void } + define void @test_fcmp_ueq_s32() #0 { ret void } + + define void @test_fcmp_true_s64() #0 { ret void } + define void @test_fcmp_false_s64() #0 { ret void } + + define void @test_fcmp_oeq_s64() #0 { ret void } + define void @test_fcmp_ogt_s64() #0 { ret void } + define void @test_fcmp_oge_s64() #0 { ret void } + define void @test_fcmp_olt_s64() #0 { ret void } + define void @test_fcmp_ole_s64() #0 { ret void } + define void @test_fcmp_ord_s64() #0 { ret void } + define void @test_fcmp_ugt_s64() #0 { ret void } + define void @test_fcmp_uge_s64() #0 { ret void } + define void @test_fcmp_ult_s64() #0 { ret void } + define void @test_fcmp_ule_s64() #0 { ret void } + define void @test_fcmp_une_s64() #0 { ret void } + define void @test_fcmp_uno_s64() #0 { ret void } + + define void @test_fcmp_one_s64() #0 { ret void } + define void @test_fcmp_ueq_s64() #0 { ret void } + + attributes #0 = { "target-features"="+vfp2" } ... 
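# For reading the checks in this file: operand pairs like `14, _` are the ARM
# predicate (14 = AL, always executed), and the third immediate on MOVCCi is
# the ARMCC condition code: 0=EQ, 1=NE, 2=HS, 3=LO, 4=MI, 5=PL, 6=VS, 7=VC,
# 8=HI, 9=LS, 10=GE, 11=LT, 12=GT, 13=LE.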
--- name: test_icmp_eq_s32 @@ -35,8 +75,8 @@ body: | %2(s1) = G_ICMP intpred(eq), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr %3(s32) = G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ @@ -71,8 +111,8 @@ body: | %2(s1) = G_ICMP intpred(ne), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr %3(s32) = G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ @@ -107,8 +147,8 @@ body: | %2(s1) = G_ICMP intpred(ugt), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr %3(s32) = G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ @@ -143,8 +183,8 @@ body: | %2(s1) = G_ICMP intpred(uge), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 2, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 2, %cpsr %3(s32) = G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ @@ -179,8 +219,8 @@ body: | %2(s1) = G_ICMP intpred(ult), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 3, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 3, %cpsr %3(s32) = G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ @@ -215,8 +255,8 @@ body: | %2(s1) = G_ICMP intpred(ule), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr %3(s32) = G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ @@ -251,8 +291,8 @@ body: | %2(s1) = G_ICMP intpred(sgt), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr %3(s32) = G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ @@ -287,8 +327,8 @@ body: | %2(s1) = G_ICMP intpred(sge), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr %3(s32) = 
G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ @@ -323,8 +363,8 @@ body: | %2(s1) = G_ICMP intpred(slt), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr %3(s32) = G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ @@ -359,8 +399,1180 @@ body: | %2(s1) = G_ICMP intpred(sle), %0(s32), %1 ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ - ; CHECK: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr - ; CHECK: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr + ; CHECK-NEXT: CMPrr [[VREGX]], [[VREGY]], 14, _, implicit-def %cpsr + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_true_s32 +# CHECK-LABEL: name: test_fcmp_true_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + %1(s32) = COPY %s1 + + %2(s1) = G_FCMP floatpred(true), %0(s32), %1 + ; CHECK: [[RES:%[0-9]+]] = MOVi 1, 14, _, _ + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_false_s32 +# CHECK-LABEL: name: test_fcmp_false_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + %1(s32) = COPY %s1 + + %2(s1) = G_FCMP floatpred(false), %0(s32), %1 + ; CHECK: [[RES:%[0-9]+]] = MOVi 0, 14, _, _ + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_oeq_s32 +# CHECK-LABEL: name: test_fcmp_oeq_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(oeq), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
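# The remaining single-compare floatpred tests below all select this same
# shape and vary only the condition code on MOVCCi:
#   VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv
#   FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv
#   [[RES]] = MOVCCi [[ZERO]], 1, <cc>, %cpsr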
+--- +name: test_fcmp_ogt_s32 +# CHECK-LABEL: name: test_fcmp_ogt_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(ogt), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_oge_s32 +# CHECK-LABEL: name: test_fcmp_oge_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(oge), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_olt_s32 +# CHECK-LABEL: name: test_fcmp_olt_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(olt), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 4, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ole_s32 +# CHECK-LABEL: name: test_fcmp_ole_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(ole), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ord_s32 +# CHECK-LABEL: name: test_fcmp_ord_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(ord), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 7, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ugt_s32 +# CHECK-LABEL: name: test_fcmp_ugt_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(ugt), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_uge_s32 +# CHECK-LABEL: name: test_fcmp_uge_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(uge), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 5, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ult_s32 +# CHECK-LABEL: name: test_fcmp_ult_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(ult), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ule_s32 +# CHECK-LABEL: name: test_fcmp_ule_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(ule), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_une_s32 +# CHECK-LABEL: name: test_fcmp_une_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(une), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_uno_s32 +# CHECK-LABEL: name: test_fcmp_uno_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(uno), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 6, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_one_s32 +# CHECK-LABEL: name: test_fcmp_one_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(one), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 4, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
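# [Editorial note, not part of the original commit] No single ARM condition
# covers "ordered and not equal", so fcmp one is selected as two full
# compare-and-conditional-move sequences: GT (12) for the ogt half and MI (4)
# for the olt half. The second MOVCCi takes the first result as its "false"
# operand, so the final value is 1 if either condition held.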
+--- +name: test_fcmp_ueq_s32 +# CHECK-LABEL: name: test_fcmp_ueq_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0 + + %1(s32) = COPY %s1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1 + + %2(s1) = G_FCMP floatpred(ueq), %0(s32), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr + ; CHECK-NEXT: VCMPS [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 6, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_true_s64 +# CHECK-LABEL: name: test_fcmp_true_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + %1(s64) = COPY %d1 + + %2(s1) = G_FCMP floatpred(true), %0(s64), %1 + ; CHECK: [[RES:%[0-9]+]] = MOVi 1, 14, _, _ + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_false_s64 +# CHECK-LABEL: name: test_fcmp_false_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + %1(s64) = COPY %d1 + + %2(s1) = G_FCMP floatpred(false), %0(s64), %1 + ; CHECK: [[RES:%[0-9]+]] = MOVi 0, 14, _, _ + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_oeq_s64 +# CHECK-LABEL: name: test_fcmp_oeq_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(oeq), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ogt_s64 +# CHECK-LABEL: name: test_fcmp_ogt_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(ogt), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_oge_s64 +# CHECK-LABEL: name: test_fcmp_oge_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(oge), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 10, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_olt_s64 +# CHECK-LABEL: name: test_fcmp_olt_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(olt), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 4, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ole_s64 +# CHECK-LABEL: name: test_fcmp_ole_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(ole), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 9, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ord_s64 +# CHECK-LABEL: name: test_fcmp_ord_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(ord), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 7, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ugt_s64 +# CHECK-LABEL: name: test_fcmp_ugt_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(ugt), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 8, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_uge_s64 +# CHECK-LABEL: name: test_fcmp_uge_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(uge), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 5, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ult_s64 +# CHECK-LABEL: name: test_fcmp_ult_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(ult), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 11, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ule_s64 +# CHECK-LABEL: name: test_fcmp_ule_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(ule), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 13, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_une_s64 +# CHECK-LABEL: name: test_fcmp_une_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(une), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 1, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_uno_s64 +# CHECK-LABEL: name: test_fcmp_uno_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(uno), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[ZERO]], 1, 6, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_one_s64 +# CHECK-LABEL: name: test_fcmp_one_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(one), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 12, %cpsr + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 4, %cpsr + + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ + + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[RET]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ueq_s64 +# CHECK-LABEL: name: test_fcmp_ueq_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: fprb } + - { id: 2, class: gprb } + - { id: 3, class: gprb } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0 + + %1(s64) = COPY %d1 + ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1 + + %2(s1) = G_FCMP floatpred(ueq), %0(s64), %1 + ; CHECK: [[ZERO:%[0-9]+]] = MOVi 0, 14, _, _ + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES1:%[0-9]+]] = MOVCCi [[ZERO]], 1, 0, %cpsr + ; CHECK-NEXT: VCMPD [[VREGX]], [[VREGY]], 14, _, implicit-def %fpscr_nzcv + ; CHECK-NEXT: FMSTAT 14, _, implicit-def %cpsr, implicit %fpscr_nzcv + ; CHECK-NEXT: [[RES:%[0-9]+]] = MOVCCi [[RES1]], 1, 6, %cpsr %3(s32) = G_ZEXT %2(s1) ; CHECK: [[RET:%[0-9]+]] = ANDri [[RES]], 1, 14, _, _ diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll index 7d021fdb43dd9..98b39e444ac77 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll @@ -49,3 +49,33 @@ define arm_aapcscc double @test_add_double(double %x, double %y) { %r = fadd double %x, %y ret double %r } + +define arm_aapcs_vfpcc i32 @test_cmp_float_ogt(float %x, float %y) { +; CHECK-LABEL: test_cmp_float_ogt +; HARD: vcmp.f32 +; HARD: vmrs APSR_nzcv, fpscr +; HARD-NEXT: movgt +; SOFT-AEABI: blx __aeabi_fcmpgt +; SOFT-DEFAULT: blx __gtsf2 +entry: + %v = fcmp ogt float %x, %y + %r = zext i1 %v to i32 + ret i32 %r +} + +define arm_aapcs_vfpcc i32 @test_cmp_float_one(float %x, float %y) { +; CHECK-LABEL: test_cmp_float_one +; HARD: vcmp.f32 +; HARD: vmrs APSR_nzcv, fpscr +; HARD: movgt +; HARD-NOT: vcmp +; HARD: movmi +; SOFT-AEABI-DAG: blx __aeabi_fcmpgt +; SOFT-AEABI-DAG: blx __aeabi_fcmplt +; SOFT-DEFAULT-DAG: blx __gtsf2 +; SOFT-DEFAULT-DAG: blx __ltsf2 +entry: + %v = fcmp one float %x, %y + %r = zext i1 %v to i32 + ret i32 %r +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir index c93e7fa0ec560..9a0877846fc3e 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-divmod.mir @@ -36,6 +36,7 @@ body: | %0(s32) = COPY %r0 %1(s32) = COPY %r1 ; HWDIV: [[R:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]] + ; SOFT-NOT: G_SDIV ; SOFT: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X]] ; SOFT-DAG: %r1 = COPY [[Y]] @@ -44,6 +45,7 @@ body: | ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SDIV %2(s32) = G_SDIV %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s32) @@ -70,6 +72,7 @@ body: | %0(s32) = COPY %r0 %1(s32) = COPY %r1 ; HWDIV: [[R:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]] + ; SOFT-NOT: G_UDIV ; SOFT: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X]] ; SOFT-DAG: %r1 = COPY [[Y]] @@ -78,6 +81,7 @@ body: | ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UDIV %2(s32) = G_UDIV %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s32) @@ -106,6 +110,7 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 ; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; SOFT-NOT: G_SDIV ; SOFT: 
ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X32]] ; SOFT-DAG: %r1 = COPY [[Y32]] @@ -114,7 +119,9 @@ body: | ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SDIV ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + ; SOFT-NOT: G_SDIV %2(s16) = G_SDIV %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s16) @@ -143,6 +150,7 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 ; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; SOFT-NOT: G_UDIV ; SOFT: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X32]] ; SOFT-DAG: %r1 = COPY [[Y32]] @@ -151,7 +159,9 @@ body: | ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UDIV ; CHECK: [[R:%[0-9]+]](s16) = G_TRUNC [[R32]] + ; SOFT-NOT: G_UDIV %2(s16) = G_UDIV %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s16) @@ -180,6 +190,7 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 ; HWDIV: [[R32:%[0-9]+]](s32) = G_SDIV [[X32]], [[Y32]] + ; SOFT-NOT: G_SDIV ; SOFT: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X32]] ; SOFT-DAG: %r1 = COPY [[Y32]] @@ -188,7 +199,9 @@ body: | ; SOFT-DEFAULT: BLX $__divsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SDIV ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + ; SOFT-NOT: G_SDIV %2(s8) = G_SDIV %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s8) @@ -217,6 +230,7 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 ; HWDIV: [[R32:%[0-9]+]](s32) = G_UDIV [[X32]], [[Y32]] + ; SOFT-NOT: G_UDIV ; SOFT: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X32]] ; SOFT-DAG: %r1 = COPY [[Y32]] @@ -225,7 +239,9 @@ body: | ; SOFT-DEFAULT: BLX $__udivsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 ; SOFT-DEFAULT: [[R32:%[0-9]+]](s32) = COPY %r0 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UDIV ; CHECK: [[R:%[0-9]+]](s8) = G_TRUNC [[R32]] + ; SOFT-NOT: G_UDIV %2(s8) = G_UDIV %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s8) @@ -254,6 +270,7 @@ body: | ; HWDIV: [[Q:%[0-9]+]](s32) = G_SDIV [[X]], [[Y]] ; HWDIV: [[P:%[0-9]+]](s32) = G_MUL [[Q]], [[Y]] ; HWDIV: [[R:%[0-9]+]](s32) = G_SUB [[X]], [[P]] + ; SOFT-NOT: G_SREM ; SOFT: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X]] ; SOFT-DAG: %r1 = COPY [[Y]] @@ -262,6 +279,7 @@ body: | ; SOFT-DEFAULT: BLX $__modsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_SREM %2(s32) = G_SREM %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s32) @@ -290,6 +308,7 @@ body: | ; HWDIV: [[Q:%[0-9]+]](s32) = G_UDIV [[X]], [[Y]] ; HWDIV: [[P:%[0-9]+]](s32) = G_MUL [[Q]], [[Y]] ; HWDIV: [[R:%[0-9]+]](s32) = G_SUB [[X]], [[P]] + ; SOFT-NOT: G_UREM ; SOFT: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X]] ; SOFT-DAG: %r1 = COPY [[Y]] @@ -298,6 +317,7 @@ body: | ; SOFT-DEFAULT: BLX $__umodsi3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 ; SOFT-DEFAULT: [[R:%[0-9]+]](s32) = COPY %r0 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_UREM %2(s32) = G_UREM %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s32) diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir index 803135ba595e4..cb61f95b10ce9 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir @@ -10,6 +10,44 @@ define void @test_fadd_float() { ret void } define 
void @test_fadd_double() { ret void } + + define void @test_fcmp_true_s32() { ret void } + define void @test_fcmp_false_s32() { ret void } + + define void @test_fcmp_oeq_s32() { ret void } + define void @test_fcmp_ogt_s32() { ret void } + define void @test_fcmp_oge_s32() { ret void } + define void @test_fcmp_olt_s32() { ret void } + define void @test_fcmp_ole_s32() { ret void } + define void @test_fcmp_ord_s32() { ret void } + define void @test_fcmp_ugt_s32() { ret void } + define void @test_fcmp_uge_s32() { ret void } + define void @test_fcmp_ult_s32() { ret void } + define void @test_fcmp_ule_s32() { ret void } + define void @test_fcmp_une_s32() { ret void } + define void @test_fcmp_uno_s32() { ret void } + + define void @test_fcmp_one_s32() { ret void } + define void @test_fcmp_ueq_s32() { ret void } + + define void @test_fcmp_true_s64() { ret void } + define void @test_fcmp_false_s64() { ret void } + + define void @test_fcmp_oeq_s64() { ret void } + define void @test_fcmp_ogt_s64() { ret void } + define void @test_fcmp_oge_s64() { ret void } + define void @test_fcmp_olt_s64() { ret void } + define void @test_fcmp_ole_s64() { ret void } + define void @test_fcmp_ord_s64() { ret void } + define void @test_fcmp_ugt_s64() { ret void } + define void @test_fcmp_uge_s64() { ret void } + define void @test_fcmp_ult_s64() { ret void } + define void @test_fcmp_ule_s64() { ret void } + define void @test_fcmp_une_s64() { ret void } + define void @test_fcmp_uno_s64() { ret void } + + define void @test_fcmp_one_s64() { ret void } + define void @test_fcmp_ueq_s64() { ret void } ... --- name: test_frem_float @@ -31,6 +69,7 @@ body: | ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 %0(s32) = COPY %r0 %1(s32) = COPY %r1 + ; CHECK-NOT: G_FREM ; CHECK: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X]] ; SOFT-DAG: %r1 = COPY [[Y]] @@ -41,6 +80,7 @@ body: | ; SOFT: [[R:%[0-9]+]](s32) = COPY %r0 ; HARD: [[R:%[0-9]+]](s32) = COPY %s0 ; CHECK: ADJCALLSTACKUP + ; CHECK-NOT: G_FREM %2(s32) = G_FREM %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s32) @@ -86,6 +126,7 @@ body: | ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]] %4(s64) = G_MERGE_VALUES %0(s32), %1(s32) %5(s64) = G_MERGE_VALUES %2(s32), %3(s32) + ; CHECK-NOT: G_FREM ; CHECK: ADJCALLSTACKDOWN ; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]] ; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]] @@ -96,6 +137,7 @@ body: | ; SOFT: BLX $fmod, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 ; HARD: BLX $fmod, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0 ; CHECK: ADJCALLSTACKUP + ; CHECK-NOT: G_FREM %6(s64) = G_FREM %4, %5 %7(s32), %8(s32) = G_UNMERGE_VALUES %6(s64) %r0 = COPY %7(s32) @@ -122,6 +164,7 @@ body: | ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 %0(s32) = COPY %r0 %1(s32) = COPY %r1 + ; CHECK-NOT: G_FPOW ; CHECK: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X]] ; SOFT-DAG: %r1 = COPY [[Y]] @@ -132,6 +175,7 @@ body: | ; SOFT: [[R:%[0-9]+]](s32) = COPY %r0 ; HARD: [[R:%[0-9]+]](s32) = COPY %s0 ; CHECK: ADJCALLSTACKUP + ; CHECK-NOT: G_FPOW %2(s32) = G_FPOW %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s32) @@ -177,6 +221,7 @@ body: | ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]] %4(s64) = G_MERGE_VALUES %0(s32), %1(s32) %5(s64) = G_MERGE_VALUES %2(s32), %3(s32) + ; CHECK-NOT: G_FPOW ; CHECK: ADJCALLSTACKDOWN ; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]] ; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]] @@ -187,6 +232,7 @@ body: | ; SOFT: BLX $pow, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, 
implicit-def %r1 ; HARD: BLX $pow, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0 ; CHECK: ADJCALLSTACKUP + ; CHECK-NOT: G_FPOW %6(s64) = G_FPOW %4, %5 %7(s32), %8(s32) = G_UNMERGE_VALUES %6(s64) %r0 = COPY %7(s32) @@ -214,6 +260,7 @@ body: | %0(s32) = COPY %r0 %1(s32) = COPY %r1 ; HARD: [[R:%[0-9]+]](s32) = G_FADD [[X]], [[Y]] + ; SOFT-NOT: G_FADD ; SOFT: ADJCALLSTACKDOWN ; SOFT-DAG: %r0 = COPY [[X]] ; SOFT-DAG: %r1 = COPY [[Y]] @@ -221,6 +268,7 @@ body: | ; SOFT-DEFAULT: BLX $__addsf3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 ; SOFT: [[R:%[0-9]+]](s32) = COPY %r0 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_FADD %2(s32) = G_FADD %0, %1 ; CHECK: %r0 = COPY [[R]] %r0 = COPY %2(s32) @@ -261,6 +309,7 @@ body: | %4(s64) = G_MERGE_VALUES %0(s32), %1(s32) %5(s64) = G_MERGE_VALUES %2(s32), %3(s32) ; HARD: [[R:%[0-9]+]](s64) = G_FADD [[X]], [[Y]] + ; SOFT-NOT: G_FADD ; SOFT: ADJCALLSTACKDOWN ; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]] ; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]] @@ -269,6 +318,7 @@ body: | ; SOFT-AEABI: BLX $__aeabi_dadd, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 ; SOFT-DEFAULT: BLX $__adddf3, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 ; SOFT: ADJCALLSTACKUP + ; SOFT-NOT: G_FADD %6(s64) = G_FADD %4, %5 ; HARD-DAG: G_UNMERGE_VALUES [[R]](s64) %7(s32),%8(s32) = G_UNMERGE_VALUES %6(s64) @@ -276,3 +326,1565 @@ body: | %r1 = COPY %8(s32) BX_RET 14, _, implicit %r0, implicit %r1 ... +--- +name: test_fcmp_true_s32 +# CHECK-LABEL: name: test_fcmp_true_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(true), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(true), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 -1 + ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32) + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_false_s32 +# CHECK-LABEL: name: test_fcmp_false_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(false), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(false), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32) + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_oeq_s32 +# CHECK-LABEL: name: test_fcmp_oeq_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(oeq), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oeq), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__eqsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ogt_s32 +# CHECK-LABEL: name: test_fcmp_ogt_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(ogt), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ogt), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gtsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
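# [Editorial note, not part of the original commit] The legalization tests in
# this file check the two soft-float conventions side by side: the AEABI
# __aeabi_fcmp* helpers return a boolean in r0, so the result only needs a
# G_TRUNC, while the default libgcc-style __*sf2 helpers return a three-way
# value that must be compared against zero with the signed predicate matching
# the original condition (e.g. intpred(sgt) for ogt).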
+--- +name: test_fcmp_oge_s32 +# CHECK-LABEL: name: test_fcmp_oge_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(oge), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oge), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpge, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_olt_s32 +# CHECK-LABEL: name: test_fcmp_olt_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(olt), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(olt), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmplt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__ltsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ole_s32 +# CHECK-LABEL: name: test_fcmp_ole_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(ole), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ole), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmple, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__lesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ord_s32 +# CHECK-LABEL: name: test_fcmp_ord_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(ord), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ord), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpun, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__unordsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
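# [Editorial note, not part of the original commit] ord has no helper of its
# own; it is legalized as the negation of uno: call __aeabi_fcmpun (or
# __unordsf2) and test the returned value for equality with zero.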
+--- +name: test_fcmp_ugt_s32 +# CHECK-LABEL: name: test_fcmp_ugt_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(ugt), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ugt), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmple, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__lesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_uge_s32 +# CHECK-LABEL: name: test_fcmp_uge_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(uge), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uge), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmplt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__ltsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ult_s32 +# CHECK-LABEL: name: test_fcmp_ult_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(ult), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ult), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpge, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_ule_s32 +# CHECK-LABEL: name: test_fcmp_ule_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(ule), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ule), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gtsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
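# [Editorial note, not part of the original commit] The unordered predicates
# ugt/uge/ult/ule also have no direct helper, so each is lowered through the
# complementary ordered comparison (ugt via fcmple, uge via fcmplt, ult via
# fcmpge, ule via fcmpgt) with the sense of the result inverted: the AEABI
# boolean is tested with intpred(eq) against zero, and the libgcc three-way
# value keeps the signed predicate of the original condition, which (assuming
# the usual libgcc convention of returning the "unfavorable" sign for NaN
# inputs) also yields true for unordered operands.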
+--- +name: test_fcmp_une_s32 +# CHECK-LABEL: name: test_fcmp_une_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(une), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(une), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__nesf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_uno_s32 +# CHECK-LABEL: name: test_fcmp_uno_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(uno), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uno), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpun, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__unordsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_one_s32 +# CHECK-LABEL: name: test_fcmp_one_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(one), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(one), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gtsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]] + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET1]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmplt, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__ltsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET2]](s32), [[ZERO]] + ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]] + ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]] + ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]] + ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
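# [Editorial note, not part of the original commit] As in the selection tests,
# one and ueq need two libcalls (fcmpgt+fcmplt and fcmpeq+fcmpun,
# respectively); the two boolean results are widened with G_ANYEXT, combined
# with G_OR, and truncated back to s1.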
+--- +name: test_fcmp_ueq_s32 +# CHECK-LABEL: name: test_fcmp_ueq_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %r0, %r1 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1 + %2(s1) = G_FCMP floatpred(ueq), %0(s32), %1 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ueq), [[X]](s32), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__eqsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]] + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET1]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X]] + ; SOFT-DAG: %r1 = COPY [[Y]] + ; SOFT-AEABI: BLX $__aeabi_fcmpun, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__unordsf2, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0 + ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET2]](s32), [[ZERO]] + ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]] + ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]] + ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]] + ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]] + ; SOFT-NOT: G_FCMP + %3(s32) = G_ZEXT %2(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %3(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_true_s64 +# CHECK-LABEL: name: test_fcmp_true_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(true), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(true), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 -1 + ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32) + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_false_s64 +# CHECK-LABEL: name: test_fcmp_false_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(false), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(false), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: [[REXT:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]](s32) + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... +--- +name: test_fcmp_oeq_s64 +# CHECK-LABEL: name: test_fcmp_oeq_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(oeq), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oeq), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__eqdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ogt_s64 +# CHECK-LABEL: name: test_fcmp_ogt_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(ogt), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ogt), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gtdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_oge_s64 +# CHECK-LABEL: name: test_fcmp_oge_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(oge), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(oge), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpge, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gedf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_olt_s64 +# CHECK-LABEL: name: test_fcmp_olt_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(olt), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(olt), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmplt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__ltdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ole_s64 +# CHECK-LABEL: name: test_fcmp_ole_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(ole), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ole), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmple, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__ledf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ord_s64 +# CHECK-LABEL: name: test_fcmp_ord_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(ord), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ord), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpun, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__unorddf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ugt_s64 +# CHECK-LABEL: name: test_fcmp_ugt_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(ugt), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ugt), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmple, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__ledf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_uge_s64 +# CHECK-LABEL: name: test_fcmp_uge_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(uge), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uge), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmplt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__ltdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sge), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ult_s64 +# CHECK-LABEL: name: test_fcmp_ult_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(ult), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ult), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpge, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gedf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ule_s64 +# CHECK-LABEL: name: test_fcmp_ule_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(ule), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ule), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gtdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(sle), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_une_s64 +# CHECK-LABEL: name: test_fcmp_une_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(une), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(une), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__nedf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET]](s32), [[ZERO]] + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_uno_s64 +# CHECK-LABEL: name: test_fcmp_uno_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(uno), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(uno), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpun, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__unorddf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R:%[0-9]+]](s1) = G_TRUNC [[RET]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_one_s64 +# CHECK-LABEL: name: test_fcmp_one_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(one), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(one), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpgt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__gtdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(sgt), [[RET1]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmplt, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__ltdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(slt), [[RET2]](s32), [[ZERO]] + ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]] + ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]] + ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]] + ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
+--- +name: test_fcmp_ueq_s64 +# CHECK-LABEL: name: test_fcmp_ueq_s64 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } + - { id: 6, class: _ } + - { id: 7, class: _ } +body: | + bb.0: + liveins: %r0, %r1, %r2, %r3 + + %0(s32) = COPY %r0 + %1(s32) = COPY %r1 + %2(s32) = COPY %r2 + %3(s32) = COPY %r3 + ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0 + ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1 + ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2 + ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3 + %4(s64) = G_MERGE_VALUES %0(s32), %1 + %5(s64) = G_MERGE_VALUES %2(s32), %3 + ; HARD-DAG: [[X:%[0-9]+]](s64) = G_MERGE_VALUES [[X0]](s32), [[X1]](s32) + ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_MERGE_VALUES [[Y0]](s32), [[Y1]](s32) + %6(s1) = G_FCMP floatpred(ueq), %4(s64), %5 + ; HARD: [[R:%[0-9]+]](s1) = G_FCMP floatpred(ueq), [[X]](s64), [[Y]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpeq, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__eqdf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET1:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R1:%[0-9]+]](s1) = G_TRUNC [[RET1]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R1:%[0-9]+]](s1) = G_ICMP intpred(eq), [[RET1]](s32), [[ZERO]] + ; SOFT-NOT: G_FCMP + ; SOFT: ADJCALLSTACKDOWN + ; SOFT-DAG: %r0 = COPY [[X0]] + ; SOFT-DAG: %r1 = COPY [[X1]] + ; SOFT-DAG: %r2 = COPY [[Y0]] + ; SOFT-DAG: %r3 = COPY [[Y1]] + ; SOFT-AEABI: BLX $__aeabi_dcmpun, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT-DEFAULT: BLX $__unorddf2, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0 + ; SOFT: [[RET2:%[0-9]+]](s32) = COPY %r0 + ; SOFT: ADJCALLSTACKUP + ; SOFT-AEABI: [[R2:%[0-9]+]](s1) = G_TRUNC [[RET2]](s32) + ; SOFT-DEFAULT: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0 + ; SOFT-DEFAULT: [[R2:%[0-9]+]](s1) = G_ICMP intpred(ne), [[RET2]](s32), [[ZERO]] + ; SOFT-DAG: [[R1EXT:%[0-9]+]](s32) = G_ANYEXT [[R1]] + ; SOFT-DAG: [[R2EXT:%[0-9]+]](s32) = G_ANYEXT [[R2]] + ; SOFT: [[REXT:%[0-9]+]](s32) = G_OR [[R1EXT]], [[R2EXT]] + ; SOFT: [[R:%[0-9]+]](s1) = G_TRUNC [[REXT]] + ; SOFT-NOT: G_FCMP + %7(s32) = G_ZEXT %6(s1) + ; CHECK: [[REXT:%[0-9]+]](s32) = G_ZEXT [[R]](s1) + %r0 = COPY %7(s32) + ; CHECK: %r0 = COPY [[REXT]] + BX_RET 14, _, implicit %r0 +... 
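The SOFT checks throughout these fcmp tests all instantiate one fixed recipe: the G_FCMP is replaced by a comparison libcall whose i32 return value is re-tested with an integer compare against zero; the AEABI helpers return a normalized 0/1, so several predicates get away with a plain G_TRUNC; and the two composite predicates (one, ueq) issue two libcalls and OR the partial results. As a reading aid, here is a self-contained C++ sketch of that predicate-to-libcall table for the single-precision case (the s64 tests use the corresponding __aeabi_dcmp* / *df2 helpers). The type and function names are invented for illustration; this is a model of what the checks expect, not the actual GlobalISel legalizer API.

#include <cstdio>

// Illustrative model of the SOFT-DEFAULT lowering checked above: one libcall
// per predicate plus an integer re-check of its i32 result; 'one' and 'ueq'
// need a second libcall whose boolean result is OR'd with the first.
enum class FPred { OEQ, OGT, OGE, OLT, OLE, ORD, UGT, UGE, ULT, ULE, UNE, UNO, ONE, UEQ };

struct Lowering {
  const char *Call1; const char *Pred1; // libcall and intpred on its result
  const char *Call2; const char *Pred2; // second pair, only for one/ueq
};

static Lowering lowerFCmp(FPred P) {
  switch (P) {
  case FPred::OEQ: return {"__eqsf2",    "eq",  nullptr, nullptr};
  case FPred::OGT: return {"__gtsf2",    "sgt", nullptr, nullptr};
  case FPred::OGE: return {"__gesf2",    "sge", nullptr, nullptr};
  case FPred::OLT: return {"__ltsf2",    "slt", nullptr, nullptr};
  case FPred::OLE: return {"__lesf2",    "sle", nullptr, nullptr};
  case FPred::ORD: return {"__unordsf2", "eq",  nullptr, nullptr};
  case FPred::UNO: return {"__unordsf2", "ne",  nullptr, nullptr};
  case FPred::UNE: return {"__nesf2",    "ne",  nullptr, nullptr};
  // Each unordered predicate reuses the libcall of the inverse ordered
  // predicate and adjusts the sense of the integer re-check accordingly.
  case FPred::UGT: return {"__lesf2",    "sgt", nullptr, nullptr};
  case FPred::UGE: return {"__ltsf2",    "sge", nullptr, nullptr};
  case FPred::ULT: return {"__gesf2",    "slt", nullptr, nullptr};
  case FPred::ULE: return {"__gtsf2",    "sle", nullptr, nullptr};
  // Composite predicates: two libcalls, boolean results OR'd together.
  case FPred::ONE: return {"__gtsf2",    "sgt", "__ltsf2",    "slt"};
  case FPred::UEQ: return {"__eqsf2",    "eq",  "__unordsf2", "ne"};
  }
  return {nullptr, nullptr, nullptr, nullptr};
}

int main() {
  Lowering L = lowerFCmp(FPred::UEQ);
  std::printf("ueq: %s/icmp %s, then %s/icmp %s, then OR\n",
              L.Call1, L.Pred1, L.Call2, L.Pred2);
  return 0;
}

This also shows why the true and false predicates never produce a libcall at all: they legalize directly to the constants -1 and 0, exactly as the G_CONSTANT checks in the true/false tests expect.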
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index bf759728c3658..4575341dfc290 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -111,6 +111,7 @@ body: | %1(s8) = COPY %r1 %2(s8) = G_ADD %0, %1 ; G_ADD with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) @@ -136,6 +137,7 @@ body: | %1(s16) = COPY %r1 %2(s16) = G_ADD %0, %1 ; G_ADD with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) @@ -187,6 +189,7 @@ body: | %1(s8) = COPY %r1 %2(s8) = G_SUB %0, %1 ; G_SUB with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) @@ -212,6 +215,7 @@ body: | %1(s16) = COPY %r1 %2(s16) = G_SUB %0, %1 ; G_SUB with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) @@ -263,6 +267,7 @@ body: | %1(s8) = COPY %r1 %2(s8) = G_MUL %0, %1 ; G_MUL with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) @@ -288,6 +293,7 @@ body: | %1(s16) = COPY %r1 %2(s16) = G_MUL %0, %1 ; G_MUL with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) @@ -339,6 +345,7 @@ body: | %1(s8) = COPY %r1 %2(s8) = G_AND %0, %1 ; G_AND with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_AND {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_AND {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s8) = G_AND {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) @@ -364,6 +371,7 @@ body: | %1(s16) = COPY %r1 %2(s16) = G_AND %0, %1 ; G_AND with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_AND {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_AND {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s16) = G_AND {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) @@ -415,6 +423,7 @@ body: | %1(s8) = COPY %r1 %2(s8) = G_OR %0, %1 ; G_OR with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_OR {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_OR {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s8) = G_OR {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) @@ -440,6 +449,7 @@ body: | %1(s16) = COPY %r1 %2(s16) = G_OR %0, %1 ; G_OR with s16 should widen + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_OR {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_OR {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s16) = G_OR {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) @@ -491,6 +501,7 @@ body: | %1(s8) = COPY %r1 %2(s8) = G_XOR %0, %1 ; G_XOR with s8 should widen + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_XOR {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_XOR {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s8) = G_XOR {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) @@ -516,6 +527,7 @@ body: | %1(s16) = COPY %r1 %2(s16) = G_XOR %0, %1 ; G_XOR with s16 should widen + ; 
CHECK-NOT: {{%[0-9]+}}(s16) = G_XOR {{%[0-9]+, %[0-9]+}} ; CHECK: {{%[0-9]+}}(s32) = G_XOR {{%[0-9]+, %[0-9]+}} ; CHECK-NOT: {{%[0-9]+}}(s16) = G_XOR {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) @@ -689,11 +701,32 @@ selected: false tracksRegLiveness: true registers: - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } body: | bb.0: %0(s32) = G_CONSTANT 42 ; CHECK: {{%[0-9]+}}(s32) = G_CONSTANT 42 + %1(s16) = G_CONSTANT i16 21 + ; CHECK-NOT: G_CONSTANT i16 + ; CHECK: [[EXT:%[0-9]+]](s32) = G_CONSTANT i32 21 + ; CHECK: {{%[0-9]+}}(s16) = G_TRUNC [[EXT]](s32) + ; CHECK-NOT: G_CONSTANT i16 + + %2(s8) = G_CONSTANT i8 10 + ; CHECK-NOT: G_CONSTANT i8 + ; CHECK: [[EXT:%[0-9]+]](s32) = G_CONSTANT i32 10 + ; CHECK: {{%[0-9]+}}(s8) = G_TRUNC [[EXT]](s32) + ; CHECK-NOT: G_CONSTANT i8 + + %3(s1) = G_CONSTANT i1 1 + ; CHECK-NOT: G_CONSTANT i1 + ; CHECK: [[EXT:%[0-9]+]](s32) = G_CONSTANT i32 -1 + ; CHECK: {{%[0-9]+}}(s1) = G_TRUNC [[EXT]](s32) + ; CHECK-NOT: G_CONSTANT i1 + %r0 = COPY %0(s32) BX_RET 14, _, implicit %r0 ... diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index d3b93e488ef47..ffca431d96ea1 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -35,6 +35,8 @@ define void @test_trunc_s32_16() { ret void } define void @test_icmp_eq_s32() { ret void } + define void @test_fcmp_one_s32() #0 { ret void } + define void @test_fcmp_ugt_s64() #0 { ret void } define void @test_select_s32() { ret void } @@ -743,6 +745,62 @@ body: | ... --- +name: test_fcmp_one_s32 +# CHECK-LABEL: name: test_fcmp_one_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: fprb, preferred-register: '' } +# CHECK: - { id: 1, class: fprb, preferred-register: '' } +# CHECK: - { id: 2, class: gprb, preferred-register: '' } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %s0, %s1 + + %0(s32) = COPY %s0 + %1(s32) = COPY %s1 + %2(s1) = G_FCMP floatpred(one), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %r0 = COPY %3(s32) + BX_RET 14, _, implicit %r0 + +... +--- +name: test_fcmp_ugt_s64 +# CHECK-LABEL: name: test_fcmp_ugt_s64 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: fprb, preferred-register: '' } +# CHECK: - { id: 1, class: fprb, preferred-register: '' } +# CHECK: - { id: 2, class: gprb, preferred-register: '' } + +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.0: + liveins: %d0, %d1 + + %0(s64) = COPY %d0 + %1(s64) = COPY %d1 + %2(s1) = G_FCMP floatpred(ugt), %0(s64), %1 + %3(s32) = G_ZEXT %2(s1) + %r0 = COPY %3(s32) + BX_RET 14, _, implicit %r0 + +... 
+--- name: test_select_s32 # CHECK-LABEL: name: test_select_s32 legalized: true diff --git a/test/CodeGen/ARM/arguments-nosplit-double.ll b/test/CodeGen/ARM/arguments-nosplit-double.ll index 8e4dee45ddf27..bb3710842d34c 100644 --- a/test/CodeGen/ARM/arguments-nosplit-double.ll +++ b/test/CodeGen/ARM/arguments-nosplit-double.ll @@ -8,5 +8,6 @@ define i32 @f(i64 %z, i32 %a, double %b) { ret i32 %tmp } +; CHECK-LABEL: f: ; CHECK-NOT: r3 diff --git a/test/CodeGen/ARM/arguments-nosplit-i64.ll b/test/CodeGen/ARM/arguments-nosplit-i64.ll index 4a08d0a0406ac..02bdc6cc227a0 100644 --- a/test/CodeGen/ARM/arguments-nosplit-i64.ll +++ b/test/CodeGen/ARM/arguments-nosplit-i64.ll @@ -8,5 +8,6 @@ define i32 @f(i64 %z, i32 %a, i64 %b) { ret i32 %tmp } +; CHECK-LABEL: f: ; CHECK-NOT: r3 diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll index d54848a6bcf19..0ae2d5f6f2f2b 100644 --- a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll +++ b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll @@ -13,13 +13,13 @@ ; CHECK: rdefs left ; CHECK-NEXT: Latency : 4 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; CHECK-SAME: Latency=1 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=3 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=3 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=4 define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize { %1 = load i32, i32* @a, align 4 diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll index 9cb076651f5b3..bc7a14b1028ef 100644 --- a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll +++ b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll @@ -8,9 +8,9 @@ ; CHECK: rdefs left ; CHECK-NEXT: Latency : 3 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; CHECK-SAME: Latency=3 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=3 define i32 @foo(i32* %a) nounwind optsize { diff --git a/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll index 774b0a907e399..67cddc14d0475 100644 --- a/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll +++ b/test/CodeGen/ARM/cortex-a57-misched-stm-wrback.ll @@ -10,7 +10,7 @@ ; CHECK: rdefs left ; CHECK-NEXT: Latency : 2 ; CHECK: Successors -; CHECK: data +; CHECK: Data ; CHECK-SAME: Latency=1 define i32 @bar(i32 %v0, i32 %v1, i32 %v2, i32* %addr) { diff --git a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll index e234e179ed071..372b2e2f5dc99 100644 --- a/test/CodeGen/ARM/cortex-a57-misched-vfma.ll +++ b/test/CodeGen/ARM/cortex-a57-misched-vfma.ll @@ -11,7 +11,7 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float ; > VMULS common latency = 5 ; CHECK: Latency : 5 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMULS read-advanced latency to VMLAS = 0 ; CHECK-SAME: Latency=0 @@ -20,7 +20,7 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float ; > VMLAS common latency = 9 ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLAS read-advanced latency to the next VMLAS = 4 ; CHECK-SAME: Latency=4 @@ -28,7 +28,7 @@ define float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float ; CHECK-FAST: VFMAS ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLAS not-optimized latency to VMOVRS = 9 ; CHECK-SAME: Latency=9 @@ -50,7 +50,7 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> 
%f2, <2 x float> %f3, <2 ; > VMULfd common latency = 5 ; CHECK: Latency : 5 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; VMULfd read-advanced latency to VMLAfd = 0 ; CHECK-SAME: Latency=0 @@ -59,7 +59,7 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 ; > VMLAfd common latency = 9 ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLAfd read-advanced latency to the next VMLAfd = 4 ; CHECK-SAME: Latency=4 @@ -67,7 +67,7 @@ define <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 ; CHECK-FAST: VFMAfd ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLAfd not-optimized latency to VMOVRRD = 9 ; CHECK-SAME: Latency=9 @@ -88,7 +88,7 @@ define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float ; > VMULS common latency = 5 ; CHECK: Latency : 5 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMULS read-advanced latency to VMLSS = 0 ; CHECK-SAME: Latency=0 @@ -97,7 +97,7 @@ define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float ; > VMLSS common latency = 9 ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLSS read-advanced latency to the next VMLSS = 4 ; CHECK-SAME: Latency=4 @@ -105,7 +105,7 @@ define float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float ; CHECK-FAST: VFMSS ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLSS not-optimized latency to VMOVRS = 9 ; CHECK-SAME: Latency=9 @@ -127,7 +127,7 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 ; > VMULfd common latency = 5 ; CHECK: Latency : 5 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; VMULfd read-advanced latency to VMLSfd = 0 ; CHECK-SAME: Latency=0 @@ -136,7 +136,7 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 ; > VMLSfd common latency = 9 ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLSfd read-advanced latency to the next VMLSfd = 4 ; CHECK-SAME: Latency=4 @@ -144,7 +144,7 @@ define <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 ; CHECK-FAST: VFMSfd ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLSfd not-optimized latency to VMOVRRD = 9 ; CHECK-SAME: Latency=9 @@ -165,7 +165,7 @@ define float @Test5(float %f1, float %f2, float %f3) { ; CHECK-FAST: VFNMS ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLAS not-optimized latency to VMOVRS = 9 ; CHECK-SAME: Latency=9 @@ -184,7 +184,7 @@ define float @Test6(float %f1, float %f2, float %f3) { ; CHECK-FAST: VFNMA ; CHECK: Latency : 9 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; > VMLAS not-optimized latency to VMOVRS = 9 ; CHECK-SAME: Latency=9 diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll index 6cfa823fb9694..b5edcc3042293 100644 --- a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll +++ b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll @@ -13,15 +13,15 @@ ; CHECK: rdefs left ; CHECK-NEXT: Latency : 6 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; CHECK-SAME: Latency=1 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=1 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=5 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=5 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=6 define i32 @bar(i32* %iptr) minsize optsize 
{ %1 = load double, double* @a, align 8 diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll index 218b5b41a7e43..12c7b3270c3b3 100644 --- a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll +++ b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll @@ -8,11 +8,11 @@ ; CHECK: rdefs left ; CHECK-NEXT: Latency : 6 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; CHECK-SAME: Latency=5 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=5 -; CHECK-NEXT: data +; CHECK-NEXT: Data ; CHECK-SAME: Latency=6 define double @foo(double* %a) nounwind optsize { diff --git a/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll index af1c469d44432..05c498eee49f7 100644 --- a/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll +++ b/test/CodeGen/ARM/cortex-a57-misched-vstm-wrback.ll @@ -9,7 +9,7 @@ ; CHECK: rdefs left ; CHECK-NEXT: Latency : 4 ; CHECK: Successors: -; CHECK: data +; CHECK: Data ; CHECK-SAME: Latency=1 @a = global double 0.0, align 4 diff --git a/test/CodeGen/ARM/fence-singlethread.ll b/test/CodeGen/ARM/fence-singlethread.ll index ec032ccac423c..536b6cc7c9d01 100644 --- a/test/CodeGen/ARM/fence-singlethread.ll +++ b/test/CodeGen/ARM/fence-singlethread.ll @@ -11,6 +11,6 @@ define void @fence_singlethread() { ; CHECK: @ COMPILER BARRIER ; CHECK-NOT: dmb - fence singlethread seq_cst + fence syncscope("singlethread") seq_cst ret void } diff --git a/test/CodeGen/ARM/ror.ll b/test/CodeGen/ARM/ror.ll new file mode 100644 index 0000000000000..0f699a8dd29d6 --- /dev/null +++ b/test/CodeGen/ARM/ror.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s + +; rotr (rotr x, 4), 6 -> rotr x, 10 -> ror r0, r0, #10 +define i32 @test1(i32 %x) nounwind readnone { +; CHECK-LABEL: test1: +; CHECK: ror r0, r0, #10 +; CHECK: bx lr +entry: + %high_part.i = shl i32 %x, 28 + %low_part.i = lshr i32 %x, 4 + %result.i = or i32 %high_part.i, %low_part.i + %high_part.i.1 = shl i32 %result.i, 26 + %low_part.i.2 = lshr i32 %result.i, 6 + %result.i.3 = or i32 %low_part.i.2, %high_part.i.1 + ret i32 %result.i.3 +} + +; the same vector test +define <2 x i32> @test2(<2 x i32> %x) nounwind readnone { +; CHECK-LABEL: test2: +; CHECK: ror r0, r0, #10 +; CHECK: ror r1, r1, #10 +; CHECK: bx lr +entry: + %high_part.i = shl <2 x i32> %x, <i32 28, i32 28> + %low_part.i = lshr <2 x i32> %x, <i32 4, i32 4> + %result.i = or <2 x i32> %high_part.i, %low_part.i + %high_part.i.1 = shl <2 x i32> %result.i, <i32 26, i32 26> + %low_part.i.2 = lshr <2 x i32> %result.i, <i32 6, i32 6> + %result.i.3 = or <2 x i32> %low_part.i.2, %high_part.i.1 + ret <2 x i32> %result.i.3 +} + diff --git a/test/CodeGen/ARM/scavenging.mir b/test/CodeGen/ARM/scavenging.mir new file mode 100644 index 0000000000000..09040a3bd217e --- /dev/null +++ b/test/CodeGen/ARM/scavenging.mir @@ -0,0 +1,66 @@ +# RUN: llc -o - %s -mtriple=arm-arm-none-eabi -mcpu=cortex-m0 -run-pass scavenger-test | FileCheck %s +--- +# CHECK-LABEL: name: scavengebug0 +# Make sure we are not spilling/using a physreg used in the very last +# instruction of the scavenging range. 
+# CHECK-NOT: tSTRi {{.*}}%r0,{{.*}}%r0
+# CHECK-NOT: tSTRi {{.*}}%r1,{{.*}}%r1
+# CHECK-NOT: tSTRi {{.*}}%r2,{{.*}}%r2
+# CHECK-NOT: tSTRi {{.*}}%r3,{{.*}}%r3
+# CHECK-NOT: tSTRi {{.*}}%r4,{{.*}}%r4
+# CHECK-NOT: tSTRi {{.*}}%r5,{{.*}}%r5
+# CHECK-NOT: tSTRi {{.*}}%r6,{{.*}}%r6
+# CHECK-NOT: tSTRi {{.*}}%r7,{{.*}}%r7
+name: scavengebug0
+body: |
+  bb.0:
+    ; Bring up register pressure to force emergency spilling
+    %r0 = IMPLICIT_DEF
+    %r1 = IMPLICIT_DEF
+    %r2 = IMPLICIT_DEF
+    %r3 = IMPLICIT_DEF
+    %r4 = IMPLICIT_DEF
+    %r5 = IMPLICIT_DEF
+    %r6 = IMPLICIT_DEF
+    %r7 = IMPLICIT_DEF
+
+    %0 : tgpr = IMPLICIT_DEF
+    %0 = tADDhirr %0, %sp, 14, _
+    tSTRi %r0, %0, 0, 14, _
+
+    %1 : tgpr = IMPLICIT_DEF
+    %1 = tADDhirr %1, %sp, 14, _
+    tSTRi %r1, %1, 0, 14, _
+
+    %2 : tgpr = IMPLICIT_DEF
+    %2 = tADDhirr %2, %sp, 14, _
+    tSTRi %r2, %2, 0, 14, _
+
+    %3 : tgpr = IMPLICIT_DEF
+    %3 = tADDhirr %3, %sp, 14, _
+    tSTRi %r3, %3, 0, 14, _
+
+    %4 : tgpr = IMPLICIT_DEF
+    %4 = tADDhirr %4, %sp, 14, _
+    tSTRi %r4, %4, 0, 14, _
+
+    %5 : tgpr = IMPLICIT_DEF
+    %5 = tADDhirr %5, %sp, 14, _
+    tSTRi %r5, %5, 0, 14, _
+
+    %6 : tgpr = IMPLICIT_DEF
+    %6 = tADDhirr %6, %sp, 14, _
+    tSTRi %r6, %6, 0, 14, _
+
+    %7 : tgpr = IMPLICIT_DEF
+    %7 = tADDhirr %7, %sp, 14, _
+    tSTRi %r7, %7, 0, 14, _
+
+    KILL %r0
+    KILL %r1
+    KILL %r2
+    KILL %r3
+    KILL %r4
+    KILL %r5
+    KILL %r6
+    KILL %r7
diff --git a/test/CodeGen/AVR/branch-relaxation.ll b/test/CodeGen/AVR/branch-relaxation.ll
new file mode 100644
index 0000000000000..d6f07f6535763
--- /dev/null
+++ b/test/CodeGen/AVR/branch-relaxation.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -march=avr | FileCheck %s
+
+; CHECK-LABEL: relax_breq
+; CHECK: cpi r{{[0-9]+}}, 0
+; CHECK: brne LBB0_1
+; CHECK: rjmp LBB0_2
+; LBB0_1:
+
+define i8 @relax_breq(i1 %a) {
+entry-block:
+  br i1 %a, label %hello, label %finished
+
+hello:
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  br label %finished
+finished:
+  ret i8 3
+}
+
+; CHECK-LABEL: no_relax_breq
+; CHECK: cpi r{{[0-9]+}}, 0
+; CHECK: breq [[END_BB:LBB[0-9]+_[0-9]+]]
+; CHECK: nop
+; ...
+; LBB0_1:
+define i8 @no_relax_breq(i1 %a) {
+entry-block:
+  br i1 %a, label %hello, label %finished
+
+hello:
+  ; There are not enough NOPs to require relaxation.
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  call void asm sideeffect "nop", ""()
+  br label %finished
+finished:
+  ret i8 3
+}
+
diff --git a/test/CodeGen/AVR/ctlz.ll b/test/CodeGen/AVR/ctlz.ll
index 4f73e846b1f19..8659550baf908 100644
--- a/test/CodeGen/AVR/ctlz.ll
+++ b/test/CodeGen/AVR/ctlz.ll
@@ -10,7 +10,8 @@ declare i8 @llvm.ctlz.i8(i8)
 
 ; CHECK-LABEL: count_leading_zeros:
 ; CHECK: cpi [[RESULT:r[0-9]+]], 0
-; CHECK: breq LBB0_1
+; CHECK: brne LBB0_1
+; CHECK: rjmp LBB0_2
 ; CHECK: mov [[SCRATCH:r[0-9]+]], {{.*}}[[RESULT]]
 ; CHECK: lsr {{.*}}[[SCRATCH]]
 ; CHECK: or {{.*}}[[SCRATCH]], {{.*}}[[RESULT]]
@@ -43,6 +44,6 @@ declare i8 @llvm.ctlz.i8(i8)
 ; CHECK: add {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
 ; CHECK: andi {{.*}}[[RESULT]], 15
 ; CHECK: ret
-; CHECK: LBB0_1:
+; CHECK: LBB0_2:
 ; CHECK: ldi {{.*}}[[RESULT]], 8
 ; CHECK: ret
diff --git a/test/CodeGen/AVR/cttz.ll b/test/CodeGen/AVR/cttz.ll
index 2501566275ea0..02d36954f5264 100644
--- a/test/CodeGen/AVR/cttz.ll
+++ b/test/CodeGen/AVR/cttz.ll
@@ -10,7 +10,7 @@ declare i8 @llvm.cttz.i8(i8)
 
 ; CHECK-LABEL: count_trailing_zeros:
 ; CHECK: cpi [[RESULT:r[0-9]+]], 0
-; CHECK: breq LBB0_1
+; CHECK: breq [[END_BB:LBB[0-9]+_[0-9]+]]
 ; CHECK: mov [[SCRATCH:r[0-9]+]], {{.*}}[[RESULT]]
 ; CHECK: dec {{.*}}[[SCRATCH]]
 ; CHECK: com {{.*}}[[RESULT]]
@@ -34,7 +34,7 @@ declare i8 @llvm.cttz.i8(i8)
 ; CHECK: andi {{.*}}[[SCRATCH]], 15
 ; CHECK: mov {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
 ; CHECK: ret
-; CHECK: LBB0_1:
+; CHECK: [[END_BB]]:
 ; CHECK: ldi {{.*}}[[SCRATCH]], 8
 ; CHECK: mov {{.*}}[[RESULT]], {{.*}}[[SCRATCH]]
 ; CHECK: ret
diff --git a/test/CodeGen/AVR/frmidx-iterator-bug.ll b/test/CodeGen/AVR/frmidx-iterator-bug.ll
new file mode 100644
index 0000000000000..f9e2f0688fafb
--- /dev/null
+++ b/test/CodeGen/AVR/frmidx-iterator-bug.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=avr -mattr=avr6 | FileCheck %s
+
+%str_slice = type { i8*, i16 }
+%Machine = type { i16, [0 x i8], i16, [0 x i8], [16 x i8], [0 x i8] }
+
+; CHECK-LABEL: step
+define void @step(%Machine*) {
+  ret void
+}
+
+; CHECK-LABEL: main
+define void @main() {
+start:
+  %machine = alloca %Machine, align 8
+  %v0 = bitcast %Machine* %machine to i8*
+  %v1 = getelementptr inbounds %Machine, %Machine* %machine, i16 0, i32 2
+  %v2 = load i16, i16* %v1, align 2
+  br label %bb2.i5
+
+bb2.i5:
+  %v18 = load volatile i8, i8* inttoptr (i16 77 to i8*), align 1
+  %v19 = icmp sgt i8 %v18, -1
+  br i1 %v19, label %bb2.i5, label %bb.exit6
+
+bb.exit6:
+  %v20 = load volatile i8, i8* inttoptr (i16 78 to i8*), align 2
+  br label %bb7
+
+bb7:
+  call void @step(%Machine* %machine)
+  br label %bb7
+}
+
diff --git 
a/test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll b/test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll new file mode 100644 index 0000000000000..17ac29e2cdb86 --- /dev/null +++ b/test/CodeGen/AVR/icall-func-pointer-correct-addr-space.ll @@ -0,0 +1,15 @@ +; RUN: llc -mattr=lpm,lpmw < %s -march=avr | FileCheck %s + +declare void @callback(i16 zeroext) + +; CHECK-LABEL: foo +define void @foo() { +entry: + ; CHECK: ldi r{{[0-9]+}}, pm_lo8(callback) + ; CHECK-NEXT: ldi r{{[0-9]+}}, pm_hi8(callback) + call void @bar(i8 zeroext undef, void (i16)* @callback) + ret void +} + +declare void @bar(i8 zeroext, void (i16)*) + diff --git a/test/CodeGen/AVR/pseudo/ANDIWRdK.mir b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir index bcea4e6dfe271..4d58c85f4f232 100644 --- a/test/CodeGen/AVR/pseudo/ANDIWRdK.mir +++ b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir @@ -17,8 +17,8 @@ body: | ; CHECK-LABEL: test_andiwrdrr - ; CHECK: %r20 = ANDIRdK %r20, 175, implicit-def dead %sreg - ; CHECK-NEXT: %r21 = ANDIRdK %r21, 250, implicit-def %sreg + ; CHECK: %r16 = ANDIRdK %r16, 175, implicit-def dead %sreg + ; CHECK-NEXT: %r17 = ANDIRdK %r17, 250, implicit-def %sreg - %r21r20 = ANDIWRdK %r17r16, 64175, implicit-def %sreg + %r17r16 = ANDIWRdK %r17r16, 64175, implicit-def %sreg ... diff --git a/test/CodeGen/AVR/pseudo/COMWRd.mir b/test/CodeGen/AVR/pseudo/COMWRd.mir index 58ff7af7cb3c6..db68a4082b735 100644 --- a/test/CodeGen/AVR/pseudo/COMWRd.mir +++ b/test/CodeGen/AVR/pseudo/COMWRd.mir @@ -20,5 +20,5 @@ body: | ; CHECK: %r14 = COMRd %r14, implicit-def dead %sreg ; CHECK-NEXT: %r15 = COMRd %r15, implicit-def %sreg - %r15r14 = COMWRd %r9r8, implicit-def %sreg + %r15r14 = COMWRd %r15r14, implicit-def %sreg ... diff --git a/test/CodeGen/AVR/pseudo/ORIWRdK.mir b/test/CodeGen/AVR/pseudo/ORIWRdK.mir index d77a6ba884881..eaa12842df428 100644 --- a/test/CodeGen/AVR/pseudo/ORIWRdK.mir +++ b/test/CodeGen/AVR/pseudo/ORIWRdK.mir @@ -20,5 +20,5 @@ body: | ; CHECK: %r20 = ORIRdK %r20, 175, implicit-def dead %sreg ; CHECK-NEXT: %r21 = ORIRdK %r21, 250, implicit-def %sreg - %r21r20 = ORIWRdK %r17r16, 64175, implicit-def %sreg + %r21r20 = ORIWRdK %r21r20, 64175, implicit-def %sreg ... diff --git a/test/CodeGen/AVR/pseudo/SBCIWRdK.mir b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir index 644e6106ee790..a92f6951798bf 100644 --- a/test/CodeGen/AVR/pseudo/SBCIWRdK.mir +++ b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir @@ -20,5 +20,5 @@ body: | ; CHECK: %r20 = SBCIRdK %r20, 175, implicit-def %sreg, implicit killed %sreg ; CHECK-NEXT: %r21 = SBCIRdK %r21, 250, implicit-def %sreg, implicit killed %sreg - %r21r20 = SBCIWRdK %r17r16, 64175, implicit-def %sreg, implicit %sreg + %r21r20 = SBCIWRdK %r21r20, 64175, implicit-def %sreg, implicit %sreg ... diff --git a/test/CodeGen/AVR/pseudo/SUBIWRdK.mir b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir index c7d88d7ab3f68..38ff880a51720 100644 --- a/test/CodeGen/AVR/pseudo/SUBIWRdK.mir +++ b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir @@ -20,5 +20,5 @@ body: | ; CHECK: %r20 = SUBIRdK %r20, 175, implicit-def %sreg ; CHECK-NEXT: %r21 = SBCIRdK %r21, 250, implicit-def %sreg, implicit killed %sreg - %r21r20 = SUBIWRdK %r17r16, 64175, implicit-def %sreg + %r21r20 = SUBIWRdK %r21r20, 64175, implicit-def %sreg ... 
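
Taken together, the five AVR pseudo-instruction fixes above follow a single pattern: each 16-bit wide pseudo (ANDIWRdK, COMWRd, ORIWRdK, SBCIWRdK, SUBIWRdK) expands to a pair of 8-bit read-modify-write instructions, so the MIR must feed the pseudo the same register pair it defines. A minimal sketch of the corrected shape, reusing the ORIWRdK operands from the test above (the expansion lines simply restate that test's CHECK expectations; 64175 is 0xFAAF, i.e. low byte 175 and high byte 250):

    # Wide pseudo: the source pair now matches the destination pair.
    %r21r20 = ORIWRdK %r21r20, 64175, implicit-def %sreg
    # Expected expansion into two 8-bit read-modify-write halves:
    #   %r20 = ORIRdK %r20, 175, implicit-def dead %sreg
    #   %r21 = ORIRdK %r21, 250, implicit-def %sreg
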
diff --git a/test/CodeGen/AVR/select-mbb-placement-bug.ll b/test/CodeGen/AVR/select-mbb-placement-bug.ll index ca7ec1ab831ce..aca9502b5dfb7 100644 --- a/test/CodeGen/AVR/select-mbb-placement-bug.ll +++ b/test/CodeGen/AVR/select-mbb-placement-bug.ll @@ -8,9 +8,9 @@ define internal fastcc void @loopy() { ; ; https://github.com/avr-rust/rust/issues/49 -; CHECK: LBB0_1: -; CHECK: LBB0_2: -; CHECK-NOT: LBB0_3: +; CHECK: LBB0_{{[0-9]+}}: +; CHECK: LBB0_{{[0-9]+}}: +; CHECK-NOT: LBB0_{{[0-9]+}}: start: br label %bb7.preheader diff --git a/test/CodeGen/BPF/undef.ll b/test/CodeGen/BPF/undef.ll index de14bfde1ab97..8d8a5f429514f 100644 --- a/test/CodeGen/BPF/undef.ll +++ b/test/CodeGen/BPF/undef.ll @@ -1,4 +1,5 @@ -; RUN: not llc < %s -march=bpf | FileCheck %s +; RUN: not llc < %s -march=bpfel | FileCheck -check-prefixes=CHECK,EL %s +; RUN: not llc < %s -march=bpfeb | FileCheck -check-prefixes=CHECK,EB %s %struct.bpf_map_def = type { i32, i32, i32, i32 } %struct.__sk_buff = type opaque @@ -13,36 +14,31 @@ ; Function Attrs: nounwind uwtable define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 section "socket1" { -; CHECK: r2 = r10 -; CHECK: r2 += -2 -; CHECK: r1 = 0 -; CHECK: *(u16 *)(r2 + 6) = r1 -; CHECK: *(u16 *)(r2 + 4) = r1 -; CHECK: *(u16 *)(r2 + 2) = r1 -; CHECK: r2 = 6 -; CHECK: *(u8 *)(r10 - 7) = r2 -; CHECK: r2 = 5 -; CHECK: *(u8 *)(r10 - 8) = r2 -; CHECK: r2 = 7 -; CHECK: *(u8 *)(r10 - 6) = r2 -; CHECK: r2 = 8 -; CHECK: *(u8 *)(r10 - 5) = r2 -; CHECK: r2 = 9 -; CHECK: *(u8 *)(r10 - 4) = r2 -; CHECK: r2 = 10 -; CHECK: *(u8 *)(r10 - 3) = r2 -; CHECK: *(u16 *)(r10 + 24) = r1 -; CHECK: *(u16 *)(r10 + 22) = r1 -; CHECK: *(u16 *)(r10 + 20) = r1 -; CHECK: *(u16 *)(r10 + 18) = r1 -; CHECK: *(u16 *)(r10 + 16) = r1 -; CHECK: *(u16 *)(r10 + 14) = r1 -; CHECK: *(u16 *)(r10 + 12) = r1 -; CHECK: *(u16 *)(r10 + 10) = r1 -; CHECK: *(u16 *)(r10 + 8) = r1 -; CHECK: *(u16 *)(r10 + 6) = r1 -; CHECK: *(u16 *)(r10 - 2) = r1 -; CHECK: *(u16 *)(r10 + 26) = r1 +; CHECK: r1 = r10 +; CHECK: r1 += -2 +; CHECK: r2 = 0 +; CHECK: *(u16 *)(r1 + 6) = r2 +; CHECK: *(u16 *)(r1 + 4) = r2 +; CHECK: *(u16 *)(r1 + 2) = r2 +; EL: r1 = 134678021 +; EB: r1 = 84281096 +; CHECK: *(u32 *)(r10 - 8) = r1 +; CHECK: r1 = 9 +; CHECK: *(u8 *)(r10 - 4) = r1 +; CHECK: r1 = 10 +; CHECK: *(u8 *)(r10 - 3) = r1 +; CHECK: *(u16 *)(r10 + 24) = r2 +; CHECK: *(u16 *)(r10 + 22) = r2 +; CHECK: *(u16 *)(r10 + 20) = r2 +; CHECK: *(u16 *)(r10 + 18) = r2 +; CHECK: *(u16 *)(r10 + 16) = r2 +; CHECK: *(u16 *)(r10 + 14) = r2 +; CHECK: *(u16 *)(r10 + 12) = r2 +; CHECK: *(u16 *)(r10 + 10) = r2 +; CHECK: *(u16 *)(r10 + 8) = r2 +; CHECK: *(u16 *)(r10 + 6) = r2 +; CHECK: *(u16 *)(r10 - 2) = r2 +; CHECK: *(u16 *)(r10 + 26) = r2 ; CHECK: r2 = r10 ; CHECK: r2 += -8 ; CHECK: r1 = <MCOperand Expr:(routing)>ll diff --git a/test/CodeGen/Generic/pr33094.ll b/test/CodeGen/Generic/pr33094.ll new file mode 100644 index 0000000000000..afa464f63f663 --- /dev/null +++ b/test/CodeGen/Generic/pr33094.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s + +; PR33094 +; Make sure that a constant extractvalue doesn't cause a crash in +; SelectionDAGBuilder::visitExtractValue. 
+ +%A = type {} +%B = type {} +%Tuple = type { i64 } + +@A_Inst = global %A zeroinitializer +@B_Inst = global %B zeroinitializer + +define i64 @foo() { + ret i64 extractvalue (%Tuple select (i1 icmp eq + (%B* bitcast (%A* @A_Inst to %B*), %B* @B_Inst), + %Tuple { i64 33 }, %Tuple { i64 42 }), 0) +} diff --git a/test/CodeGen/Hexagon/convertdptoint.ll b/test/CodeGen/Hexagon/convertdptoint.ll index a09c2fd14b120..adf76e5dc82eb 100644 --- a/test/CodeGen/Hexagon/convertdptoint.ll +++ b/test/CodeGen/Hexagon/convertdptoint.ll @@ -12,10 +12,10 @@ entry: %b = alloca double, align 8 %c = alloca double, align 8 store i32 0, i32* %retval - store double 1.540000e+01, double* %a, align 8 - store double 9.100000e+00, double* %b, align 8 - %0 = load double, double* %a, align 8 - %1 = load double, double* %b, align 8 + store volatile double 1.540000e+01, double* %a, align 8 + store volatile double 9.100000e+00, double* %b, align 8 + %0 = load volatile double, double* %a, align 8 + %1 = load volatile double, double* %b, align 8 %add = fadd double %0, %1 store double %add, double* %c, align 8 %2 = load double, double* %c, align 8 diff --git a/test/CodeGen/Hexagon/convertdptoll.ll b/test/CodeGen/Hexagon/convertdptoll.ll index f46d46cf76b18..6b5bf56a248bc 100644 --- a/test/CodeGen/Hexagon/convertdptoll.ll +++ b/test/CodeGen/Hexagon/convertdptoll.ll @@ -17,8 +17,8 @@ entry: %0 = load double, double* %a, align 8 %1 = load double, double* %b, align 8 %add = fadd double %0, %1 - store double %add, double* %c, align 8 - %2 = load double, double* %c, align 8 + store volatile double %add, double* %c, align 8 + %2 = load volatile double, double* %c, align 8 %conv = fptosi double %2 to i64 store i64 %conv, i64* %i, align 8 %3 = load i64, i64* %i, align 8 diff --git a/test/CodeGen/Hexagon/convertsptoint.ll b/test/CodeGen/Hexagon/convertsptoint.ll index 7593e57d852f4..939b3b06a8c79 100644 --- a/test/CodeGen/Hexagon/convertsptoint.ll +++ b/test/CodeGen/Hexagon/convertsptoint.ll @@ -17,8 +17,8 @@ entry: %0 = load float, float* %a, align 4 %1 = load float, float* %b, align 4 %add = fadd float %0, %1 - store float %add, float* %c, align 4 - %2 = load float, float* %c, align 4 + store volatile float %add, float* %c, align 4 + %2 = load volatile float, float* %c, align 4 %conv = fptosi float %2 to i32 store i32 %conv, i32* %i, align 4 %3 = load i32, i32* %i, align 4 diff --git a/test/CodeGen/Hexagon/convertsptoll.ll b/test/CodeGen/Hexagon/convertsptoll.ll index d8432cbc812bc..f540397ccf5e5 100644 --- a/test/CodeGen/Hexagon/convertsptoll.ll +++ b/test/CodeGen/Hexagon/convertsptoll.ll @@ -17,8 +17,8 @@ entry: %0 = load float, float* %a, align 4 %1 = load float, float* %b, align 4 %add = fadd float %0, %1 - store float %add, float* %c, align 4 - %2 = load float, float* %c, align 4 + store volatile float %add, float* %c, align 4 + %2 = load volatile float, float* %c, align 4 %conv = fptosi float %2 to i64 store i64 %conv, i64* %i, align 8 %3 = load i64, i64* %i, align 8 diff --git a/test/CodeGen/Hexagon/dadd.ll b/test/CodeGen/Hexagon/dadd.ll index 5fcd705bab232..3068f499d12df 100644 --- a/test/CodeGen/Hexagon/dadd.ll +++ b/test/CodeGen/Hexagon/dadd.ll @@ -9,10 +9,10 @@ entry: %a = alloca double, align 8 %b = alloca double, align 8 %c = alloca double, align 8 - store double 1.540000e+01, double* %a, align 8 - store double 9.100000e+00, double* %b, align 8 - %0 = load double, double* %a, align 8 - %1 = load double, double* %b, align 8 + store volatile double 1.540000e+01, double* %a, align 8 + store volatile double 9.100000e+00, 
double* %b, align 8 + %0 = load volatile double, double* %a, align 8 + %1 = load volatile double, double* %b, align 8 %add = fadd double %0, %1 store double %add, double* %c, align 8 ret i32 0 diff --git a/test/CodeGen/Hexagon/dmul.ll b/test/CodeGen/Hexagon/dmul.ll index 1b79e0aa7d701..a6cf62b0c0aae 100644 --- a/test/CodeGen/Hexagon/dmul.ll +++ b/test/CodeGen/Hexagon/dmul.ll @@ -8,10 +8,10 @@ entry: %a = alloca double, align 8 %b = alloca double, align 8 %c = alloca double, align 8 - store double 1.540000e+01, double* %a, align 8 - store double 9.100000e+00, double* %b, align 8 - %0 = load double, double* %b, align 8 - %1 = load double, double* %a, align 8 + store volatile double 1.540000e+01, double* %a, align 8 + store volatile double 9.100000e+00, double* %b, align 8 + %0 = load volatile double, double* %b, align 8 + %1 = load volatile double, double* %a, align 8 %mul = fmul double %0, %1 store double %mul, double* %c, align 8 ret i32 0 diff --git a/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll b/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll index 6bf8224904ec0..ccc287c5f2bcb 100644 --- a/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll +++ b/test/CodeGen/Hexagon/doubleconvert-ieee-rnd-near.ll @@ -12,10 +12,10 @@ entry: %b = alloca double, align 8 %c = alloca double, align 8 store i32 0, i32* %retval - store double 1.540000e+01, double* %a, align 8 - store double 9.100000e+00, double* %b, align 8 - %0 = load double, double* %a, align 8 - %1 = load double, double* %b, align 8 + store volatile double 1.540000e+01, double* %a, align 8 + store volatile double 9.100000e+00, double* %b, align 8 + %0 = load volatile double, double* %a, align 8 + %1 = load volatile double, double* %b, align 8 %add = fadd double %0, %1 store double %add, double* %c, align 8 %2 = load double, double* %c, align 8 diff --git a/test/CodeGen/Hexagon/dsub.ll b/test/CodeGen/Hexagon/dsub.ll index 8b37301d84fbd..d7e44b307cf8d 100644 --- a/test/CodeGen/Hexagon/dsub.ll +++ b/test/CodeGen/Hexagon/dsub.ll @@ -8,10 +8,10 @@ entry: %a = alloca double, align 8 %b = alloca double, align 8 %c = alloca double, align 8 - store double 1.540000e+01, double* %a, align 8 - store double 9.100000e+00, double* %b, align 8 - %0 = load double, double* %b, align 8 - %1 = load double, double* %a, align 8 + store volatile double 1.540000e+01, double* %a, align 8 + store volatile double 9.100000e+00, double* %b, align 8 + %0 = load volatile double, double* %b, align 8 + %1 = load volatile double, double* %a, align 8 %sub = fsub double %0, %1 store double %sub, double* %c, align 8 ret i32 0 diff --git a/test/CodeGen/Hexagon/fadd.ll b/test/CodeGen/Hexagon/fadd.ll index 0418c1724f5bd..65c6182dcc77f 100644 --- a/test/CodeGen/Hexagon/fadd.ll +++ b/test/CodeGen/Hexagon/fadd.ll @@ -8,10 +8,10 @@ entry: %a = alloca float, align 4 %b = alloca float, align 4 %c = alloca float, align 4 - store float 0x402ECCCCC0000000, float* %a, align 4 - store float 0x4022333340000000, float* %b, align 4 - %0 = load float, float* %a, align 4 - %1 = load float, float* %b, align 4 + store volatile float 0x402ECCCCC0000000, float* %a, align 4 + store volatile float 0x4022333340000000, float* %b, align 4 + %0 = load volatile float, float* %a, align 4 + %1 = load volatile float, float* %b, align 4 %add = fadd float %0, %1 store float %add, float* %c, align 4 ret i32 0 diff --git a/test/CodeGen/Hexagon/fmul.ll b/test/CodeGen/Hexagon/fmul.ll index 552f98ec7a53a..e20e293c0a137 100644 --- a/test/CodeGen/Hexagon/fmul.ll +++ b/test/CodeGen/Hexagon/fmul.ll @@ -9,10 
+9,10 @@ entry: %a = alloca float, align 4 %b = alloca float, align 4 %c = alloca float, align 4 - store float 0x402ECCCCC0000000, float* %a, align 4 - store float 0x4022333340000000, float* %b, align 4 - %0 = load float, float* %b, align 4 - %1 = load float, float* %a, align 4 + store volatile float 0x402ECCCCC0000000, float* %a, align 4 + store volatile float 0x4022333340000000, float* %b, align 4 + %0 = load volatile float, float* %b, align 4 + %1 = load volatile float, float* %a, align 4 %mul = fmul float %0, %1 store float %mul, float* %c, align 4 ret i32 0 diff --git a/test/CodeGen/Hexagon/fsub.ll b/test/CodeGen/Hexagon/fsub.ll index d7b0e2f65b33c..e9a1fa3d192bc 100644 --- a/test/CodeGen/Hexagon/fsub.ll +++ b/test/CodeGen/Hexagon/fsub.ll @@ -8,10 +8,10 @@ entry: %a = alloca float, align 4 %b = alloca float, align 4 %c = alloca float, align 4 - store float 0x402ECCCCC0000000, float* %a, align 4 - store float 0x4022333340000000, float* %b, align 4 - %0 = load float, float* %b, align 4 - %1 = load float, float* %a, align 4 + store volatile float 0x402ECCCCC0000000, float* %a, align 4 + store volatile float 0x4022333340000000, float* %b, align 4 + %0 = load volatile float, float* %b, align 4 + %1 = load volatile float, float* %a, align 4 %sub = fsub float %0, %1 store float %sub, float* %c, align 4 ret i32 0 diff --git a/test/CodeGen/Hexagon/hasfp-crash1.ll b/test/CodeGen/Hexagon/hasfp-crash1.ll new file mode 100644 index 0000000000000..1154a7117a70a --- /dev/null +++ b/test/CodeGen/Hexagon/hasfp-crash1.ll @@ -0,0 +1,82 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s +; +; Check that this testcase does not crash. +; CHECK: jumpr r31 + +target triple = "hexagon" + +; Function Attrs: nounwind +declare i32 @foo0(i32*, i32, i64, i32, i8 zeroext, i8 zeroext, i32) local_unnamed_addr #0 + +; Function Attrs: nounwind +define i32 @foo1(i32* %a0, i32 %a1, i32 %a2, i32 %a3, i8 zeroext %a4, i8 zeroext %a5, i32 %a6) local_unnamed_addr #0 !dbg !33 { +entry: + tail call void @llvm.dbg.value(metadata i32 %a6, i64 0, metadata !51, metadata !52), !dbg !53 + ret i32 undef, !dbg !54 +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +attributes #0 = { nounwind "disable-tail-calls"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv5" "target-features"="-hvx-double,-long-calls" } +attributes #1 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!26, !27} +!llvm.linker.options = !{!29, !30, !31, !32, !29, !30, !31, !32} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !22) +!1 = !DIFile(filename: "foo.i", directory: "/path") +!2 = !{!3, !16} +!3 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 122, size: 8, elements: !5) +!4 = !DIFile(filename: "foo.h", directory: "/path") +!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15} +!6 = !DIEnumerator(name: "E0", value: 7) +!7 = !DIEnumerator(name: "E1", value: 6) +!8 = !DIEnumerator(name: "E2", value: 5) +!9 = !DIEnumerator(name: "E3", value: 0) +!10 = !DIEnumerator(name: "E4", value: 1) +!11 = !DIEnumerator(name: "E5", value: 7) +!12 = !DIEnumerator(name: "E6", value: 5) +!13 = !DIEnumerator(name: "E7", value: 4) +!14 = !DIEnumerator(name: "E8", value: 4) +!15 = !DIEnumerator(name: "E9", value: 10) +!16 = !DICompositeType(tag: 
DW_TAG_enumeration_type, file: !4, line: 136, size: 8, elements: !17) +!17 = !{!18, !19, !20, !21} +!18 = !DIEnumerator(name: "F0", value: 1) +!19 = !DIEnumerator(name: "F1", value: 2) +!20 = !DIEnumerator(name: "F2", value: 4) +!21 = !DIEnumerator(name: "F3", value: 7) +!22 = !{!23, !24, !25} +!23 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!24 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "t0_t", file: !4, line: 38, baseType: !24) +!26 = !{i32 2, !"Debug Info Version", i32 3} +!27 = !{i32 6, !"Linker Options", !28} +!28 = !{!29, !30, !31, !32} +!29 = !{!"foo0", !".text"} +!30 = !{!"foo1", !".text"} +!31 = !{!"foo2", !".text"} +!32 = !{!"foo3", !".text"} +!33 = distinct !DISubprogram(name: "foo1", scope: !34, file: !34, line: 84, type: !35, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !44) +!34 = !DIFile(filename: "foo.c", directory: "/path") +!35 = !DISubroutineType(types: !36) +!36 = !{!37, !38, !39, !40, !41, !42, !43, !37} +!37 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!38 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 32) +!39 = !DIDerivedType(tag: DW_TAG_typedef, name: "t1_t", file: !4, line: 35, baseType: !23) +!40 = !DIDerivedType(tag: DW_TAG_typedef, name: "t2_t", file: !4, line: 36, baseType: !23) +!41 = !DIDerivedType(tag: DW_TAG_typedef, name: "t3_t", file: !4, line: 43, baseType: !23) +!42 = !DIDerivedType(tag: DW_TAG_typedef, name: "t4_t", file: !4, line: 133, baseType: !3) +!43 = !DIDerivedType(tag: DW_TAG_typedef, name: "t5_t", file: !4, line: 141, baseType: !16) +!44 = !{!45, !46, !47, !48, !49, !50, !51} +!45 = !DILocalVariable(name: "a0", arg: 1, scope: !33, file: !34, line: 84, type: !38) +!46 = !DILocalVariable(name: "a1", arg: 2, scope: !33, file: !34, line: 84, type: !39) +!47 = !DILocalVariable(name: "a2", arg: 3, scope: !33, file: !34, line: 84, type: !40) +!48 = !DILocalVariable(name: "a3", arg: 4, scope: !33, file: !34, line: 84, type: !41) +!49 = !DILocalVariable(name: "a4", arg: 5, scope: !33, file: !34, line: 84, type: !42) +!50 = !DILocalVariable(name: "a5", arg: 6, scope: !33, file: !34, line: 84, type: !43) +!51 = !DILocalVariable(name: "a6", arg: 7, scope: !33, file: !34, line: 84, type: !37) +!52 = !DIExpression() +!53 = !DILocation(line: 84, column: 169, scope: !33) +!54 = !DILocation(line: 86, column: 5, scope: !33) diff --git a/test/CodeGen/Hexagon/hasfp-crash2.ll b/test/CodeGen/Hexagon/hasfp-crash2.ll new file mode 100644 index 0000000000000..c8b49948ce74e --- /dev/null +++ b/test/CodeGen/Hexagon/hasfp-crash2.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s +; +; Check that this testcase does not crash. 
+; CHECK: call foo0 + +target triple = "hexagon" + +; Function Attrs: nounwind +declare void @foo0() local_unnamed_addr #0 + +; Function Attrs: nounwind +define void @foo1() local_unnamed_addr #0 !dbg !33 { +entry: + tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !51, metadata !52), !dbg !53 + tail call void @foo0(), !dbg !54 + ret void +} + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +attributes #0 = { nounwind "disable-tail-calls"="true" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv5" "target-features"="-hvx-double,-long-calls" } +attributes #1 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!26, !27} +!llvm.linker.options = !{!29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32, !29, !30, !31, !32} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !22) +!1 = !DIFile(filename: "foo.i", directory: "/path") +!2 = !{!3, !16} +!3 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 122, size: 8, elements: !5) +!4 = !DIFile(filename: "foo.h", directory: "/path") +!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15} +!6 = !DIEnumerator(name: "E0", value: 7) +!7 = !DIEnumerator(name: "E1", value: 6) +!8 = !DIEnumerator(name: "E2", value: 5) +!9 = !DIEnumerator(name: "E3", value: 0) +!10 = !DIEnumerator(name: "E4", value: 1) +!11 = !DIEnumerator(name: "E5", value: 7) +!12 = !DIEnumerator(name: "E6", value: 5) +!13 = !DIEnumerator(name: "E7", value: 4) +!14 = !DIEnumerator(name: "E8", value: 4) +!15 = !DIEnumerator(name: "E9", value: 10) +!16 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !4, line: 136, size: 8, elements: !17) +!17 = !{!18, !19, !20, !21} +!18 = !DIEnumerator(name: "F0", value: 1) +!19 = !DIEnumerator(name: "F1", value: 2) +!20 = !DIEnumerator(name: "F2", value: 4) +!21 = !DIEnumerator(name: "F3", value: 7) +!22 = !{!23, !24, !25} +!23 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!24 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "t0_t", file: !4, line: 38, baseType: !24) +!26 = !{i32 2, !"Debug Info Version", i32 3} +!27 = !{i32 6, !"Linker Options", !28} +!28 = !{!29, !30, !31, !32} +!29 = !{!"foo0", !".text"} +!30 = !{!"foo1", !".text"} +!31 = !{!"foo2", !".text"} +!32 = !{!"foo3", !".text"} +!33 = distinct !DISubprogram(name: "foo1", scope: !34, file: !34, line: 84, type: !35, isLocal: false, isDefinition: true, scopeLine: 85, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !44) +!34 = !DIFile(filename: "foo.c", directory: "/path") +!35 = !DISubroutineType(types: !36) +!36 = !{!37, !38, !39, !40, !41, !42, !43, !37} +!37 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!38 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 32) +!39 = !DIDerivedType(tag: DW_TAG_typedef, name: "t1_t", file: !4, line: 35, baseType: !23) +!40 = !DIDerivedType(tag: DW_TAG_typedef, name: "t2_t", file: !4, line: 36, baseType: !23) +!41 = !DIDerivedType(tag: DW_TAG_typedef, name: "t3_t", file: !4, line: 43, baseType: !23) +!42 = !DIDerivedType(tag: DW_TAG_typedef, name: "t4_t", file: !4, line: 133, baseType: !3) +!43 = !DIDerivedType(tag: DW_TAG_typedef, 
name: "t5_t", file: !4, line: 141, baseType: !16) +!44 = !{!45, !46, !47, !48, !49, !50, !51} +!45 = !DILocalVariable(name: "a0", arg: 1, scope: !33, file: !34, line: 84, type: !38) +!46 = !DILocalVariable(name: "a1", arg: 2, scope: !33, file: !34, line: 84, type: !39) +!47 = !DILocalVariable(name: "a2", arg: 3, scope: !33, file: !34, line: 84, type: !40) +!48 = !DILocalVariable(name: "a3", arg: 4, scope: !33, file: !34, line: 84, type: !41) +!49 = !DILocalVariable(name: "a4", arg: 5, scope: !33, file: !34, line: 84, type: !42) +!50 = !DILocalVariable(name: "a5", arg: 6, scope: !33, file: !34, line: 84, type: !43) +!51 = !DILocalVariable(name: "a6", arg: 7, scope: !33, file: !34, line: 84, type: !37) +!52 = !DIExpression() +!53 = !DILocation(line: 84, column: 169, scope: !33) +!54 = !DILocation(line: 86, column: 12, scope: !33) diff --git a/test/CodeGen/Hexagon/hvx-nontemporal.ll b/test/CodeGen/Hexagon/hvx-nontemporal.ll new file mode 100644 index 0000000000000..98c5ef4809b08 --- /dev/null +++ b/test/CodeGen/Hexagon/hvx-nontemporal.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s +target triple = "hexagon" + +; Function Attrs: norecurse nounwind +define void @test(<32 x i32>* nocapture readonly %x, <32 x i32>* nocapture readnone %y, <32 x i32>* nocapture %a, <32 x i32>* nocapture %b) #0 { +entry: +; CHECK: v0 = vmem(r0+#7):nt + %add.ptr = getelementptr inbounds <32 x i32>, <32 x i32>* %x, i32 7 + %0 = load <32 x i32>, <32 x i32>* %add.ptr, align 128, !tbaa !1, !nontemporal !4 + +; CHECK: v1.cur = vmem(r2+#0):nt + %1 = load <32 x i32>, <32 x i32>* %a, align 128, !tbaa !1, !nontemporal !4 + +; CHECK: vmem(r3+#3):nt = v1 + %add.ptr2 = getelementptr inbounds <32 x i32>, <32 x i32>* %b, i32 3 + store <32 x i32> %1, <32 x i32>* %add.ptr2, align 128, !tbaa !1, !nontemporal !4 + +; CHECK: vmem(r2+#0):nt = v0 + store <32 x i32> %0, <32 x i32>* %a, align 128, !tbaa !1, !nontemporal !4 + ret void +} + +attributes #0 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" } + +!1 = !{!2, !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = !{i32 1} diff --git a/test/CodeGen/Hexagon/target-flag-ext.mir b/test/CodeGen/Hexagon/target-flag-ext.mir new file mode 100644 index 0000000000000..49e0d2870e00f --- /dev/null +++ b/test/CodeGen/Hexagon/target-flag-ext.mir @@ -0,0 +1,24 @@ +# RUN: llc -march=hexagon -run-pass hexagon-packetizer -o - %s | FileCheck %s +--- +name: fred +tracksRegLiveness: true + +body: | + bb.0: + ; Check that all these instructions go in the same packet. This is to + ; make sure that a target flag (other than HMOTF_ConstExtend) on an + ; operand will not be interpreted as a constant-extender flag. + ; The combination used below (pcrel + 0) does not technically make sense, + ; but combinations that do make sense require constant extending, so + ; testing this is not possible otherwise. + + ; CHECK: BUNDLE + ; CHECK-DAG: %r0 = A2_tfrsi + ; CHECK-DAG: %r1 = A2_tfrsi + ; CHECK-DAG: %r2 = A2_tfrsi + ; CHECK: } + %r0 = A2_tfrsi target-flags (hexagon-pcrel) 0 + %r1 = A2_tfrsi target-flags (hexagon-pcrel) 0 + %r2 = A2_tfrsi target-flags (hexagon-pcrel) 0 +... 
+ diff --git a/test/CodeGen/MIR/AArch64/atomic-memoperands.mir b/test/CodeGen/MIR/AArch64/atomic-memoperands.mir index 1fe42a7314881..1c81f580bee53 100644 --- a/test/CodeGen/MIR/AArch64/atomic-memoperands.mir +++ b/test/CodeGen/MIR/AArch64/atomic-memoperands.mir @@ -14,7 +14,7 @@ # CHECK: %3(s16) = G_LOAD %0(p0) :: (load acquire 2) # CHECK: G_STORE %3(s16), %0(p0) :: (store release 2) # CHECK: G_STORE %2(s32), %0(p0) :: (store acq_rel 4) -# CHECK: G_STORE %1(s64), %0(p0) :: (store singlethread seq_cst 8) +# CHECK: G_STORE %1(s64), %0(p0) :: (store syncscope("singlethread") seq_cst 8) name: atomic_memoperands body: | bb.0: @@ -25,6 +25,6 @@ body: | %3:_(s16) = G_LOAD %0(p0) :: (load acquire 2) G_STORE %3(s16), %0(p0) :: (store release 2) G_STORE %2(s32), %0(p0) :: (store acq_rel 4) - G_STORE %1(s64), %0(p0) :: (store singlethread seq_cst 8) + G_STORE %1(s64), %0(p0) :: (store syncscope("singlethread") seq_cst 8) RET_ReallyLR ... diff --git a/test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir b/test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir new file mode 100644 index 0000000000000..731d7165b9df9 --- /dev/null +++ b/test/CodeGen/MIR/AArch64/invalid-target-memoperands.mir @@ -0,0 +1,19 @@ +# RUN: not llc -mtriple=aarch64-none-linux-gnu -run-pass none -o /dev/null %s 2>&1 | FileCheck %s + +--- | + + define void @target_memoperands_error() { + ret void + } + +... +--- +name: target_memoperands_error +body: | + bb.0: + + %0:_(p0) = COPY %x0 + ; CHECK: [[@LINE+1]]:35: use of undefined target MMO flag 'aarch64-invalid' + %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-invalid" load 8) + RET_ReallyLR +... diff --git a/test/CodeGen/MIR/AArch64/target-memoperands.mir b/test/CodeGen/MIR/AArch64/target-memoperands.mir new file mode 100644 index 0000000000000..f853b551e0986 --- /dev/null +++ b/test/CodeGen/MIR/AArch64/target-memoperands.mir @@ -0,0 +1,22 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s + +--- | + + define void @target_memoperands() { + ret void + } + +... +--- +# CHECK-LABEL: name: target_memoperands +# CHECK: %1(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8) +# CHECK: G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8) +name: target_memoperands +body: | + bb.0: + + %0:_(p0) = COPY %x0 + %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8) + G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8) + RET_ReallyLR +... 
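
The memoperand updates in the MIR tests above and below track an IR-level syntax change: the bare singlethread keyword was replaced by a quoted syncscope argument, which also admits target-defined scopes. A short sketch of the two spellings (values illustrative; the quoted scopes shown are the ones these tests actually use):

    ; Older spelling, as these tests were originally written:
    ;   fence singlethread seq_cst
    ; Current spelling:
    fence syncscope("singlethread") seq_cst
    ; Target-defined scope, as in the AMDGPU syncscopes test below:
    store atomic i32 %v, i32* %p syncscope("agent") seq_cst, align 4
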
diff --git a/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir b/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir index 7cef01c9d12d9..c0251232fd5c7 100644 --- a/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir +++ b/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir @@ -171,8 +171,8 @@ body: | # CHECK-LABEL: name: add_f32_1.0_multi_f16_use # CHECK: %13 = V_MOV_B32_e32 1065353216, implicit %exec -# CHECK: %14 = V_ADD_F16_e32 %13, killed %11, implicit %exec -# CHECK: %15 = V_ADD_F16_e32 killed %13, killed %12, implicit %exec +# CHECK: %14 = V_ADD_F16_e32 killed %11, %13, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 killed %12, killed %13, implicit %exec name: add_f32_1.0_multi_f16_use @@ -307,8 +307,8 @@ body: | # CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use # CHECK: %14 = V_MOV_B32_e32 1065353216, implicit %exec -# CHECK: %15 = V_ADD_F16_e32 %14, %11, implicit %exec -# CHECK: %16 = V_ADD_F16_e32 %14, %12, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 %11, %14, implicit %exec +# CHECK: %16 = V_ADD_F16_e32 %12, %14, implicit %exec # CHECK: %17 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec name: add_f32_1.0_one_f32_use_multi_f16_use @@ -514,8 +514,8 @@ body: | # CHECK-LABEL: name: add_f16_1.0_multi_f32_use # CHECK: %13 = V_MOV_B32_e32 15360, implicit %exec -# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec -# CHECK: %15 = V_ADD_F32_e32 %13, %12, implicit %exec +# CHECK: %14 = V_ADD_F32_e32 %11, %13, implicit %exec +# CHECK: %15 = V_ADD_F32_e32 %12, %13, implicit %exec name: add_f16_1.0_multi_f32_use alignment: 0 @@ -581,8 +581,8 @@ body: | # CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use # CHECK: %13 = V_MOV_B32_e32 80886784, implicit %exec -# CHECK: %14 = V_ADD_F16_e32 %13, %11, implicit %exec -# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec +# CHECK: %14 = V_ADD_F16_e32 %11, %13, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 %12, %13, implicit %exec name: add_f16_1.0_other_high_bits_multi_f16_use alignment: 0 @@ -648,8 +648,8 @@ body: | # CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32 # CHECK: %13 = V_MOV_B32_e32 305413120, implicit %exec -# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec -# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec +# CHECK: %14 = V_ADD_F32_e32 %11, %13, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 %12, %13, implicit %exec name: add_f16_1.0_other_high_bits_use_f16_f32 alignment: 0 exposesReturnsTwice: false diff --git a/test/CodeGen/MIR/AMDGPU/syncscopes.mir b/test/CodeGen/MIR/AMDGPU/syncscopes.mir new file mode 100644 index 0000000000000..83506257d8bf8 --- /dev/null +++ b/test/CodeGen/MIR/AMDGPU/syncscopes.mir @@ -0,0 +1,98 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -run-pass=none %s -o - | FileCheck --check-prefix=GCN %s + +--- | + ; ModuleID = '<stdin>' + source_filename = "<stdin>" + target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" + target triple = "amdgcn-amd-amdhsa" + + define void @syncscopes(i32 %agent, i32 addrspace(4)* %agent_out, i32 %workgroup, i32 addrspace(4)* %workgroup_out, i32 %wavefront, i32 addrspace(4)* %wavefront_out) #0 { + entry: + store atomic i32 %agent, i32 addrspace(4)* %agent_out syncscope("agent") seq_cst, align 4 + store atomic i32 %workgroup, i32 addrspace(4)* %workgroup_out syncscope("workgroup") seq_cst, align 4 + store atomic i32 %wavefront, i32 addrspace(4)* %wavefront_out syncscope("wavefront") seq_cst, align 4 + ret void + } + + ; Function Attrs: 
convergent nounwind + declare { i1, i64 } @llvm.amdgcn.if(i1) #1 + + ; Function Attrs: convergent nounwind + declare { i1, i64 } @llvm.amdgcn.else(i64) #1 + + ; Function Attrs: convergent nounwind readnone + declare i64 @llvm.amdgcn.break(i64) #2 + + ; Function Attrs: convergent nounwind readnone + declare i64 @llvm.amdgcn.if.break(i1, i64) #2 + + ; Function Attrs: convergent nounwind readnone + declare i64 @llvm.amdgcn.else.break(i64, i64) #2 + + ; Function Attrs: convergent nounwind + declare i1 @llvm.amdgcn.loop(i64) #1 + + ; Function Attrs: convergent nounwind + declare void @llvm.amdgcn.end.cf(i64) #1 + + attributes #0 = { "target-cpu"="gfx803" } + attributes #1 = { convergent nounwind } + attributes #2 = { convergent nounwind readnone } + +# GCN-LABEL: name: syncscopes +# GCN: FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out) +# GCN: FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) +# GCN: FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) +... +--- +name: syncscopes +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%sgpr4_sgpr5' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.entry: + liveins: %sgpr4_sgpr5 + + S_WAITCNT 0 + %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM %sgpr4_sgpr5, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %sgpr6 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`) + %sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM %sgpr4_sgpr5, 24, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %sgpr7 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 16, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`) + %sgpr8 = S_LOAD_DWORD_IMM %sgpr4_sgpr5, 32, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`) + S_WAITCNT 127 + %vgpr0 = V_MOV_B32_e32 %sgpr0, implicit %exec, implicit-def %vgpr0_vgpr1, implicit %sgpr0_sgpr1 + %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr4_sgpr5, 40, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %vgpr1 = V_MOV_B32_e32 killed %sgpr1, implicit %exec, implicit killed %sgpr0_sgpr1, implicit %sgpr0_sgpr1, implicit %exec + %vgpr2 = V_MOV_B32_e32 killed %sgpr6, implicit %exec, implicit %exec + FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out) + S_WAITCNT 112 + %vgpr0 = V_MOV_B32_e32 %sgpr2, implicit %exec, implicit-def %vgpr0_vgpr1, implicit %sgpr2_sgpr3 + %vgpr1 = V_MOV_B32_e32 killed %sgpr3, implicit %exec, implicit killed %sgpr2_sgpr3, implicit %sgpr2_sgpr3, implicit %exec + %vgpr2 = V_MOV_B32_e32 killed %sgpr7, implicit %exec, implicit %exec + FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, 
implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) + S_WAITCNT 112 + %vgpr0 = V_MOV_B32_e32 %sgpr4, implicit %exec, implicit-def %vgpr0_vgpr1, implicit %sgpr4_sgpr5 + %vgpr1 = V_MOV_B32_e32 killed %sgpr5, implicit %exec, implicit killed %sgpr4_sgpr5, implicit %sgpr4_sgpr5, implicit %exec + %vgpr2 = V_MOV_B32_e32 killed %sgpr8, implicit %exec, implicit %exec + FLAT_STORE_DWORD killed %vgpr0_vgpr1, killed %vgpr2, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) + S_ENDPGM + +... diff --git a/test/CodeGen/MIR/AMDGPU/target-flags.mir b/test/CodeGen/MIR/AMDGPU/target-flags.mir new file mode 100644 index 0000000000000..7d288dd1b0450 --- /dev/null +++ b/test/CodeGen/MIR/AMDGPU/target-flags.mir @@ -0,0 +1,29 @@ +# RUN: llc -march=amdgcn -run-pass none -o - %s | FileCheck %s +--- | + define amdgpu_kernel void @flags() { + ret void + } + + declare void @foo() +... +--- + +# CHECK: SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead %scc +# CHECK: %1 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo + +name: flags +liveins: + - { reg: '%sgpr0_sgpr1' } +frameInfo: + maxAlignment: 8 +registers: + - { id: 0, class: sreg_64, preferred-register: '' } + - { id: 1, class: sreg_64, preferred-register: '' } +body: | + bb.0: + liveins: %sgpr0_sgpr1 + %0 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead %scc + %1 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo + + S_ENDPGM +... diff --git a/test/CodeGen/MIR/Generic/runPass.mir b/test/CodeGen/MIR/Generic/runPass.mir index 33380d4c6bb4a..54c1dd221bdb7 100644 --- a/test/CodeGen/MIR/Generic/runPass.mir +++ b/test/CodeGen/MIR/Generic/runPass.mir @@ -1,5 +1,6 @@ # RUN: llc -run-pass=greedy -debug-pass=Arguments -o - %s | FileCheck %s # RUN: llc -run-pass=regallocbasic -debug-pass=Arguments -o - %s | FileCheck %s +# RUN: llc -run-pass=regallocfast -debug-pass=Arguments -o - %s | FileCheck %s # Check that passes are initialized correctly, so that it's possible to # use -run-pass. @@ -7,6 +8,7 @@ --- # CHECK: name: foo name: foo +tracksRegLiveness: true body: | bb.0: ... 
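
The AMDGPU target-flags test above and the Hexagon one that follows round-trip the same MIR operand syntax: a target-flags(...) modifier that may name one flag or a comma-separated list, optionally combined with an addend on a global. Two representative lines, taken from those tests rather than adding new coverage:

    %1 = S_MOV_B64 target-flags(amdgpu-gotprel) @foo
    %r0 = A2_tfrsi target-flags (hexagon-pcrel, hexagon-ext) 0
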
diff --git a/test/CodeGen/MIR/Hexagon/target-flags.mir b/test/CodeGen/MIR/Hexagon/target-flags.mir
new file mode 100644
index 0000000000000..656e0a6ea8596
--- /dev/null
+++ b/test/CodeGen/MIR/Hexagon/target-flags.mir
@@ -0,0 +1,36 @@
+# RUN: llc -march=hexagon -run-pass none -o - %s | FileCheck %s
+---
+name: fred
+
+body: |
+  bb.0:
+
+    ; CHECK: target-flags(hexagon-pcrel)
+    %r0 = A2_tfrsi target-flags (hexagon-pcrel) 0
+    ; CHECK: target-flags(hexagon-got)
+    %r0 = A2_tfrsi target-flags (hexagon-got) 0
+    ; CHECK: target-flags(hexagon-lo16)
+    %r0 = A2_tfrsi target-flags (hexagon-lo16) 0
+    ; CHECK: target-flags(hexagon-hi16)
+    %r0 = A2_tfrsi target-flags (hexagon-hi16) 0
+    ; CHECK: target-flags(hexagon-gprel)
+    %r0 = A2_tfrsi target-flags (hexagon-gprel) 0
+    ; CHECK: target-flags(hexagon-gdgot)
+    %r0 = A2_tfrsi target-flags (hexagon-gdgot) 0
+    ; CHECK: target-flags(hexagon-gdplt)
+    %r0 = A2_tfrsi target-flags (hexagon-gdplt) 0
+    ; CHECK: target-flags(hexagon-ie)
+    %r0 = A2_tfrsi target-flags (hexagon-ie) 0
+    ; CHECK: target-flags(hexagon-iegot)
+    %r0 = A2_tfrsi target-flags (hexagon-iegot) 0
+    ; CHECK: target-flags(hexagon-tprel)
+    %r0 = A2_tfrsi target-flags (hexagon-tprel) 0
+
+    ; CHECK: target-flags(hexagon-ext)
+    %r0 = A2_tfrsi target-flags (hexagon-ext) 0
+    ; CHECK: target-flags(hexagon-pcrel, hexagon-ext)
+    %r0 = A2_tfrsi target-flags (hexagon-pcrel,hexagon-ext) 0
+    ; CHECK: target-flags(hexagon-ie, hexagon-ext)
+    %r0 = A2_tfrsi target-flags (hexagon-ie,hexagon-ext) 0
+...
+
diff --git a/test/CodeGen/MIR/X86/tied-physical-regs-match.mir b/test/CodeGen/MIR/X86/tied-physical-regs-match.mir
new file mode 100644
index 0000000000000..1ddf649f76a7c
--- /dev/null
+++ b/test/CodeGen/MIR/X86/tied-physical-regs-match.mir
@@ -0,0 +1,22 @@
+# RUN: not llc -march=x86-64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+# This test ensures that the Machine Verifier detects tied physical registers
+# that don't match.
+
+--- |
+
+  define i32 @foo() {
+  entry:
+    ret i32 0
+  }
+
+...
+---
+name: foo
+body: |
+  bb.0.entry:
+    liveins: %rdi
+
+    ; CHECK: Tied physical registers must match.
+    %rbx = AND64rm killed %rdx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags
+    RETQ %rbx
+...
diff --git a/test/CodeGen/MSP430/Inst16mm.ll b/test/CodeGen/MSP430/Inst16mm.ll
index 951002d60a037..14a799b91717d 100644
--- a/test/CodeGen/MSP430/Inst16mm.ll
+++ b/test/CodeGen/MSP430/Inst16mm.ll
@@ -64,6 +64,6 @@ entry:
  %0 = load i16, i16* %retval     ; <i16> [#uses=1]
  ret i16 %0
 ; CHECK-LABEL: mov2:
-; CHECK: mov.w 0(r1), 4(r1)
-; CHECK: mov.w 2(r1), 6(r1)
+; CHECK-DAG: mov.w 2(r1), 6(r1)
+; CHECK-DAG: mov.w 0(r1), 4(r1)
 }
diff --git a/test/CodeGen/NVPTX/lower-aggr-copies.ll b/test/CodeGen/NVPTX/lower-aggr-copies.ll
index f522c6722ee6f..4298442157e23 100644
--- a/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
+; RUN: opt < %s -S -nvptx-lower-aggr-copies -use-wide-memcpy-loop-lowering=true | FileCheck %s --check-prefix WIR
 
 ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
 ; llvm.mem* intrinsics get lowered to loops. 
@@ -32,6 +33,23 @@ entry: ; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd ; PTX: @%p[[PRED]] bra LBB[[LABEL]] + +; WIR-LABEL: @memcpy_caller +; WIR: entry: +; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1 +; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1 +; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0 +; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion + +; WIR: loop-memcpy-expansion: +; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ] +; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index +; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]] +; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index +; WIR: store i8 [[Load]], i8* [[DstGep]] +; WIR: [[IndexInc]] = add i64 %loop-index, 1 +; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]] +; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion } define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 { @@ -50,6 +68,23 @@ entry: ; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd ; PTX: @%p[[PRED]] bra LBB[[LABEL]] + +; WIR-LABEL: @memcpy_volatile_caller +; WIR: entry: +; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1 +; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1 +; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0 +; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion + +; WIR: loop-memcpy-expansion: +; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ] +; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index +; WIR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]] +; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index +; WIR: store volatile i8 [[Load]], i8* [[DstGep]] +; WIR: [[IndexInc]] = add i64 %loop-index, 1 +; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]] +; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion } define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 { @@ -65,6 +100,32 @@ entry: ; IR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8* ; IR: getelementptr inbounds i8, i8* [[SRCCAST]] ; IR: getelementptr inbounds i8, i8* [[DSTCAST]] + +; WIR-LABEL: @memcpy_casting_caller +; WIR: [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8* +; WIR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8* +; WIR: getelementptr inbounds i8, i8* [[SRCCAST]] +; WIR: getelementptr inbounds i8, i8* [[DSTCAST]] +} + +define i8* @memcpy_known_size(i8* %dst, i8* %src) { +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 144, i32 1, i1 false) + ret i8* %dst + +; Check that calls with compile-time constant size are handled correctly +; WIR-LABEL: @memcpy_known_size +; WIR: entry: +; WIR: br label %load-store-loop +; WIR: load-store-loop: +; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ] +; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index +; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]] +; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index +; WIR: store i8 [[Load]], i8* [[DstGep]] +; WIR: [[IndexInc]] = add i64 %loop-index, 1 +; WIR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144 +; WIR: br i1 [[Cond]], label %load-store-loop, label 
%memcpy-split } define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 { diff --git a/test/CodeGen/PowerPC/PR33636.ll b/test/CodeGen/PowerPC/PR33636.ll new file mode 100644 index 0000000000000..4a1216dd4c113 --- /dev/null +++ b/test/CodeGen/PowerPC/PR33636.ll @@ -0,0 +1,702 @@ +; Just a test case for a crash reported in +; https://bugs.llvm.org/show_bug.cgi?id=33636 +; RUN: llc -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 < %s | FileCheck %s +@g_225 = external unnamed_addr global i16, align 2 +@g_756 = external global [6 x i32], align 4 +@g_3456 = external global i32, align 4 +@g_3708 = external global [9 x i32], align 4 +@g_1252 = external global i8*, align 8 +@g_3043 = external global float*, align 8 + +; Function Attrs: nounwind +define void @main() { + br i1 undef, label %1, label %4 + +; <label>:1: ; preds = %0 + br i1 undef, label %2, label %3 + +; <label>:2: ; preds = %1 + br label %3 + +; <label>:3: ; preds = %2, %1 + br label %4 + +; <label>:4: ; preds = %3, %0 + br label %5 + +; <label>:5: ; preds = %5, %4 + br i1 undef, label %6, label %5 + +; <label>:6: ; preds = %5 + br i1 undef, label %7, label %8 + +; <label>:7: ; preds = %6 + br i1 undef, label %70, label %69 + +; <label>:8: ; preds = %6 + br i1 undef, label %9, label %50 + +; <label>:9: ; preds = %8 + br label %11 + +; <label>:10: ; preds = %28 + br i1 undef, label %11, label %12 + +; <label>:11: ; preds = %10, %9 + br label %13 + +; <label>:12: ; preds = %10 + br label %30 + +; <label>:13: ; preds = %23, %11 + br i1 undef, label %17, label %14 + +; <label>:14: ; preds = %13 + br i1 undef, label %16, label %15 + +; <label>:15: ; preds = %14 + br label %22 + +; <label>:16: ; preds = %14 + br label %17 + +; <label>:17: ; preds = %16, %13 + br i1 undef, label %18, label %19 + +; <label>:18: ; preds = %17 + br label %19 + +; <label>:19: ; preds = %18, %17 + br i1 undef, label %48, label %20 + +; <label>:20: ; preds = %19 + br i1 undef, label %48, label %21 + +; <label>:21: ; preds = %20 + br label %22 + +; <label>:22: ; preds = %21, %15 + br i1 undef, label %23, label %24 + +; <label>:23: ; preds = %22 + br label %13 + +; <label>:24: ; preds = %22 + br i1 undef, label %28, label %25 + +; <label>:25: ; preds = %24 + br label %26 + +; <label>:26: ; preds = %26, %25 + br i1 undef, label %26, label %27 + +; <label>:27: ; preds = %26 + br label %48 + +; <label>:28: ; preds = %24 + br i1 undef, label %29, label %10 + +; <label>:29: ; preds = %28 + br label %48 + +; <label>:30: ; preds = %33, %12 + br i1 undef, label %32, label %33 + +; <label>:31: ; preds = %33 + br label %34 + +; <label>:32: ; preds = %30 + br label %33 + +; <label>:33: ; preds = %32, %30 + br i1 undef, label %30, label %31 + +; <label>:34: ; preds = %47, %31 + br i1 undef, label %35, label %36 + +; <label>:35: ; preds = %34 + br label %36 + +; <label>:36: ; preds = %35, %34 + br label %37 + +; <label>:37: ; preds = %45, %36 + br i1 undef, label %40, label %38 + +; <label>:38: ; preds = %37 + br i1 undef, label %39, label %46 + +; <label>:39: ; preds = %38 + br label %41 + +; <label>:40: ; preds = %37 + br label %41 + +; <label>:41: ; preds = %40, %39 + br label %42 + +; <label>:42: ; preds = %44, %41 + br i1 undef, label %43, label %44 + +; <label>:43: ; preds = %42 + br label %44 + +; <label>:44: ; preds = %43, %42 + br i1 undef, label %42, label %45 + +; <label>:45: ; preds = %44 + br i1 undef, label %37, label %47 + +; <label>:46: ; preds = %38 + br label %48 + +; <label>:47: ; preds = %45 + br i1 undef, label %34, label %49 + +; <label>:48: ; preds = %46, 
%29, %27, %20, %19 + br label %65 + +; <label>:49: ; preds = %47 + br label %58 + +; <label>:50: ; preds = %8 + br i1 undef, label %52, label %51 + +; <label>:51: ; preds = %50 + br label %57 + +; <label>:52: ; preds = %50 + br label %53 + +; <label>:53: ; preds = %56, %52 + br i1 undef, label %54, label %59 + +; <label>:54: ; preds = %53 + br i1 undef, label %60, label %59 + +; <label>:55: ; preds = %64 + br label %56 + +; <label>:56: ; preds = %64, %55 + br i1 undef, label %57, label %53 + +; <label>:57: ; preds = %56, %51 + br label %58 + +; <label>:58: ; preds = %57, %49 + br label %65 + +; <label>:59: ; preds = %63, %62, %61, %60, %54, %53 + br label %65 + +; <label>:60: ; preds = %54 + br i1 undef, label %61, label %59 + +; <label>:61: ; preds = %60 + br i1 undef, label %62, label %59 + +; <label>:62: ; preds = %61 + br i1 undef, label %63, label %59 + +; <label>:63: ; preds = %62 + br i1 undef, label %64, label %59 + +; <label>:64: ; preds = %63 + br i1 undef, label %55, label %56 + +; <label>:65: ; preds = %59, %58, %48 + br i1 undef, label %66, label %67 + +; <label>:66: ; preds = %65 + br label %67 + +; <label>:67: ; preds = %66, %65 + br i1 undef, label %68, label %92 + +; <label>:68: ; preds = %67 + br label %92 + +; <label>:69: ; preds = %7 + br label %70 + +; <label>:70: ; preds = %69, %7 + br i1 undef, label %72, label %71 + +; <label>:71: ; preds = %70 + br label %72 + +; <label>:72: ; preds = %71, %70 + br i1 undef, label %73, label %74 + +; <label>:73: ; preds = %72 + br label %74 + +; <label>:74: ; preds = %73, %72 + br i1 undef, label %85, label %75 + +; <label>:75: ; preds = %74 + br i1 undef, label %84, label %76 + +; <label>:76: ; preds = %75 + br i1 undef, label %78, label %77 + +; <label>:77: ; preds = %77, %76 + br i1 undef, label %84, label %77 + +; <label>:78: ; preds = %76 + br label %79 + +; <label>:79: ; preds = %83, %78 + br i1 undef, label %83, label %80 + +; <label>:80: ; preds = %79 + br i1 undef, label %81, label %82 + +; <label>:81: ; preds = %80 + br label %83 + +; <label>:82: ; preds = %80 + br label %83 + +; <label>:83: ; preds = %82, %81, %79 + br i1 undef, label %90, label %79 + +; <label>:84: ; preds = %77, %75 + br label %92 + +; <label>:85: ; preds = %74 + br i1 undef, label %86, label %88 + +; <label>:86: ; preds = %85 + br i1 undef, label %89, label %87 + +; <label>:87: ; preds = %86 + br i1 undef, label %89, label %88 + +; <label>:88: ; preds = %87, %85 + br label %89 + +; <label>:89: ; preds = %88, %87, %86 + br label %92 + +; <label>:90: ; preds = %83 + br i1 undef, label %92, label %91 + +; <label>:91: ; preds = %90 + br label %92 + +; <label>:92: ; preds = %91, %90, %89, %84, %68, %67 + br label %93 + +; <label>:93: ; preds = %100, %92 + br label %94 + +; <label>:94: ; preds = %98, %93 + br label %95 + +; <label>:95: ; preds = %97, %94 + br i1 undef, label %96, label %97 + +; <label>:96: ; preds = %95 + br label %97 + +; <label>:97: ; preds = %96, %95 + br i1 undef, label %95, label %98 + +; <label>:98: ; preds = %97 + store i32 7, i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 7), align 4 + %99 = load volatile i32, i32* @g_3456, align 4 + br i1 undef, label %94, label %100 + +; <label>:100: ; preds = %98 + br i1 undef, label %93, label %101 + +; <label>:101: ; preds = %100 + br label %102 + +; <label>:102: ; preds = %117, %101 + br label %103 + +; <label>:103: ; preds = %109, %102 + store i8** @g_1252, i8*** undef, align 8 + br i1 undef, label %105, label %104 + +; <label>:104: ; preds = %103 + br label %105 + +; 
<label>:105: ; preds = %104, %103 + %106 = icmp eq i32 0, 0 + br i1 %106, label %107, label %116 + +; <label>:107: ; preds = %105 + br i1 icmp ne (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @g_756, i64 0, i64 0), i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 4)), label %109, label %108 + +; <label>:108: ; preds = %107 + br label %109 + +; <label>:109: ; preds = %108, %107 + %110 = phi i32 [ sdiv (i32 32, i32 zext (i1 icmp eq (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @g_756, i64 0, i64 0), i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 4)) to i32)), %108 ], [ 32, %107 ] + %111 = trunc i32 %110 to i8 + %112 = icmp ne i8 %111, 0 + %113 = and i1 %112, icmp eq (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @g_756, i64 0, i64 0), i32* getelementptr inbounds ([9 x i32], [9 x i32]* @g_3708, i64 0, i64 4)) + %114 = zext i1 %113 to i16 + store i16 %114, i16* @g_225, align 2 + %115 = load volatile float*, float** @g_3043, align 8 + br i1 undef, label %103, label %117 + +; <label>:116: ; preds = %105 + br label %119 + +; <label>:117: ; preds = %109 + br i1 undef, label %102, label %118 + +; <label>:118: ; preds = %117 + br label %119 + +; <label>:119: ; preds = %118, %116 + br i1 undef, label %120, label %231 + +; <label>:120: ; preds = %119 + br label %232 + +; <label>:121: ; preds = %230 + br label %122 + +; <label>:122: ; preds = %230, %121 + br i1 undef, label %124, label %123 + +; <label>:123: ; preds = %122 + br label %124 + +; <label>:124: ; preds = %123, %122 + br i1 undef, label %228, label %225 + +; <label>:125: ; preds = %218 + br label %127 + +; <label>:126: ; preds = %218 + br label %127 + +; <label>:127: ; preds = %216, %126, %125 + br i1 undef, label %204, label %128 + +; <label>:128: ; preds = %127 + br label %205 + +; <label>:129: ; preds = %216 + br i1 undef, label %131, label %130 + +; <label>:130: ; preds = %129 + br label %131 + +; <label>:131: ; preds = %130, %129 + br i1 undef, label %133, label %132 + +; <label>:132: ; preds = %131 + br label %133 + +; <label>:133: ; preds = %132, %131 + br label %134 + +; <label>:134: ; preds = %203, %133 + br i1 undef, label %193, label %135 + +; <label>:135: ; preds = %134 + br label %194 + +; <label>:136: ; preds = %203 + br i1 undef, label %138, label %137 + +; <label>:137: ; preds = %136 + br label %138 + +; <label>:138: ; preds = %137, %136 + br i1 undef, label %192, label %139 + +; <label>:139: ; preds = %138 + br label %191 + +; <label>:140: ; preds = %191, %190 + br i1 undef, label %180, label %141 + +; <label>:141: ; preds = %140 + br label %181 + +; <label>:142: ; preds = %190 + br i1 undef, label %143, label %178 + +; <label>:143: ; preds = %142 + br label %179 + +; <label>:144: ; preds = %179 + br label %176 + +; <label>:145: ; preds = %179 + br label %176 + +; <label>:146: ; preds = %177, %175, %174 + br i1 undef, label %165, label %147 + +; <label>:147: ; preds = %146 + br label %166 + +; <label>:148: ; preds = %174 + br label %149 + +; <label>:149: ; preds = %164, %148 + br i1 undef, label %154, label %150 + +; <label>:150: ; preds = %149 + br label %155 + +; <label>:151: ; preds = %164 + br i1 undef, label %153, label %152 + +; <label>:152: ; preds = %151 + br label %153 + +; <label>:153: ; preds = %152, %151 + ret void + +; <label>:154: ; preds = %149 + br label %155 + +; <label>:155: ; preds = %154, %150 + br i1 undef, label %157, label %156 + +; <label>:156: ; preds = %155 + br label %158 + +; <label>:157: ; preds = %155 + br label %158 + +; 
<label>:158: ; preds = %157, %156 + br i1 undef, label %160, label %159 + +; <label>:159: ; preds = %158 + br label %161 + +; <label>:160: ; preds = %158 + br label %161 + +; <label>:161: ; preds = %160, %159 + br i1 undef, label %163, label %162 + +; <label>:162: ; preds = %161 + br label %164 + +; <label>:163: ; preds = %161 + br label %164 + +; <label>:164: ; preds = %163, %162 + br i1 undef, label %151, label %149 + +; <label>:165: ; preds = %146 + br label %166 + +; <label>:166: ; preds = %165, %147 + br i1 undef, label %168, label %167 + +; <label>:167: ; preds = %166 + br label %169 + +; <label>:168: ; preds = %166 + br label %169 + +; <label>:169: ; preds = %168, %167 + br i1 undef, label %171, label %170 + +; <label>:170: ; preds = %169 + br label %172 + +; <label>:171: ; preds = %169 + br label %172 + +; <label>:172: ; preds = %171, %170 + br i1 undef, label %174, label %173 + +; <label>:173: ; preds = %172 + br label %174 + +; <label>:174: ; preds = %173, %172 + br i1 undef, label %148, label %146 + +; <label>:175: ; preds = %176 + br label %146 + +; <label>:176: ; preds = %145, %144 + br i1 undef, label %177, label %175 + +; <label>:177: ; preds = %176 + br label %146 + +; <label>:178: ; preds = %142 + br label %179 + +; <label>:179: ; preds = %178, %143 + br i1 undef, label %145, label %144 + +; <label>:180: ; preds = %140 + br label %181 + +; <label>:181: ; preds = %180, %141 + br i1 undef, label %183, label %182 + +; <label>:182: ; preds = %181 + br label %184 + +; <label>:183: ; preds = %181 + br label %184 + +; <label>:184: ; preds = %183, %182 + br i1 undef, label %186, label %185 + +; <label>:185: ; preds = %184 + br label %187 + +; <label>:186: ; preds = %184 + br label %187 + +; <label>:187: ; preds = %186, %185 + br i1 undef, label %189, label %188 + +; <label>:188: ; preds = %187 + br label %190 + +; <label>:189: ; preds = %187 + br label %190 + +; <label>:190: ; preds = %189, %188 + br i1 undef, label %142, label %140 + +; <label>:191: ; preds = %192, %139 + br label %140 + +; <label>:192: ; preds = %138 + br label %191 + +; <label>:193: ; preds = %134 + br label %194 + +; <label>:194: ; preds = %193, %135 + br i1 undef, label %196, label %195 + +; <label>:195: ; preds = %194 + br label %197 + +; <label>:196: ; preds = %194 + br label %197 + +; <label>:197: ; preds = %196, %195 + br i1 undef, label %199, label %198 + +; <label>:198: ; preds = %197 + br label %200 + +; <label>:199: ; preds = %197 + br label %200 + +; <label>:200: ; preds = %199, %198 + br i1 undef, label %202, label %201 + +; <label>:201: ; preds = %200 + br label %203 + +; <label>:202: ; preds = %200 + br label %203 + +; <label>:203: ; preds = %202, %201 + br i1 undef, label %136, label %134 + +; <label>:204: ; preds = %127 + br label %205 + +; <label>:205: ; preds = %204, %128 + br i1 undef, label %207, label %206 + +; <label>:206: ; preds = %205 + br label %208 + +; <label>:207: ; preds = %205 + br label %208 + +; <label>:208: ; preds = %207, %206 + br i1 undef, label %210, label %209 + +; <label>:209: ; preds = %208 + br label %211 + +; <label>:210: ; preds = %208 + br label %211 + +; <label>:211: ; preds = %210, %209 + br i1 undef, label %213, label %212 + +; <label>:212: ; preds = %211 + br label %214 + +; <label>:213: ; preds = %211 + br label %214 + +; <label>:214: ; preds = %213, %212 + br i1 undef, label %216, label %215 + +; <label>:215: ; preds = %214 + br label %216 + +; <label>:216: ; preds = %215, %214 + br i1 undef, label %129, label %127 + +; <label>:217: ; preds = %220 + br label 
+
+; <label>:218:                                   ; preds = %221, %217
+  br i1 undef, label %126, label %125
+
+; <label>:219:                                   ; preds = %223
+  br label %220
+
+; <label>:220:                                   ; preds = %224, %219
+  br i1 undef, label %221, label %217
+
+; <label>:221:                                   ; preds = %220
+  br label %218
+
+; <label>:222:                                   ; preds = %226
+  br label %223
+
+; <label>:223:                                   ; preds = %227, %222
+  br i1 undef, label %224, label %219
+
+; <label>:224:                                   ; preds = %223
+  br label %220
+
+; <label>:225:                                   ; preds = %124
+  br label %226
+
+; <label>:226:                                   ; preds = %228, %225
+  br i1 undef, label %227, label %222
+
+; <label>:227:                                   ; preds = %226
+  br label %223
+
+; <label>:228:                                   ; preds = %124
+  br label %226
+
+; <label>:229:                                   ; preds = %232
+  br label %230
+
+; <label>:230:                                   ; preds = %233, %229
+  br i1 undef, label %122, label %121
+
+; <label>:231:                                   ; preds = %119
+  br label %232
+
+; <label>:232:                                   ; preds = %231, %120
+  br i1 undef, label %233, label %229
+
+; <label>:233:                                   ; preds = %232
+  br label %230
+
+; CHECK: blr
+}
diff --git a/test/CodeGen/PowerPC/atomics-regression.ll b/test/CodeGen/PowerPC/atomics-regression.ll
index d57b3a203791c..0c7a31d16b199 100644
--- a/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/test/CodeGen/PowerPC/atomics-regression.ll
@@ -370,7 +370,7 @@ define void @test36() {
 ; PPC64LE:       # BB#0:
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
-  fence singlethread acquire
+  fence syncscope("singlethread") acquire
   ret void
 }

@@ -379,7 +379,7 @@ define void @test37() {
 ; PPC64LE:       # BB#0:
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
-  fence singlethread release
+  fence syncscope("singlethread") release
   ret void
 }

@@ -388,7 +388,7 @@ define void @test38() {
 ; PPC64LE:       # BB#0:
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
-  fence singlethread acq_rel
+  fence syncscope("singlethread") acq_rel
   ret void
 }

@@ -397,7 +397,7 @@ define void @test39() {
 ; PPC64LE:       # BB#0:
 ; PPC64LE-NEXT:    sync
 ; PPC64LE-NEXT:    blr
-  fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
   ret void
 }

@@ -1273,7 +1273,7 @@ define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-NEXT:  # BB#3:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
-  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread monotonic monotonic
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") monotonic monotonic
   ret void
 }

@@ -1294,7 +1294,7 @@ define void @test81(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
-  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acquire monotonic
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire monotonic
   ret void
 }

@@ -1315,7 +1315,7 @@ define void @test82(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
-  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acquire acquire
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire acquire
   ret void
 }

@@ -1336,7 +1336,7 @@ define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-NEXT:  # BB#3:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
-  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread release monotonic
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release monotonic
   ret void
 }

@@ -1357,7 +1357,7 @@ define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-NEXT:  # BB#3:
 ; PPC64LE-NEXT:    stbcx.
6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread release acquire + %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release acquire ret void } @@ -1379,7 +1379,7 @@ define void @test85(i8* %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: stbcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acq_rel monotonic + %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel monotonic ret void } @@ -1401,7 +1401,7 @@ define void @test86(i8* %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: stbcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acq_rel acquire + %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel acquire ret void } @@ -1423,7 +1423,7 @@ define void @test87(i8* %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: stbcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst monotonic + %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst monotonic ret void } @@ -1445,7 +1445,7 @@ define void @test88(i8* %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: stbcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst acquire + %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst acquire ret void } @@ -1467,7 +1467,7 @@ define void @test89(i8* %ptr, i8 %cmp, i8 %val) { ; PPC64LE-NEXT: stbcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst seq_cst + %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst seq_cst ret void } @@ -1487,7 +1487,7 @@ define void @test90(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # BB#3: ; PPC64LE-NEXT: sthcx. 6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread monotonic monotonic + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") monotonic monotonic ret void } @@ -1508,7 +1508,7 @@ define void @test91(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: sthcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acquire monotonic + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire monotonic ret void } @@ -1529,7 +1529,7 @@ define void @test92(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: sthcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acquire acquire + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire acquire ret void } @@ -1550,7 +1550,7 @@ define void @test93(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # BB#3: ; PPC64LE-NEXT: sthcx. 6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread release monotonic + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release monotonic ret void } @@ -1571,7 +1571,7 @@ define void @test94(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: # BB#3: ; PPC64LE-NEXT: sthcx. 6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread release acquire + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release acquire ret void } @@ -1593,7 +1593,7 @@ define void @test95(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: sthcx. 
6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acq_rel monotonic + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel monotonic ret void } @@ -1615,7 +1615,7 @@ define void @test96(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: sthcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acq_rel acquire + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel acquire ret void } @@ -1637,7 +1637,7 @@ define void @test97(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: sthcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst monotonic + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst monotonic ret void } @@ -1659,7 +1659,7 @@ define void @test98(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: sthcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst acquire + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst acquire ret void } @@ -1681,7 +1681,7 @@ define void @test99(i16* %ptr, i16 %cmp, i16 %val) { ; PPC64LE-NEXT: sthcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst seq_cst + %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst seq_cst ret void } @@ -1701,7 +1701,7 @@ define void @test100(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: # BB#3: ; PPC64LE-NEXT: stwcx. 6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread monotonic monotonic + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") monotonic monotonic ret void } @@ -1722,7 +1722,7 @@ define void @test101(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: stwcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acquire monotonic + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire monotonic ret void } @@ -1743,7 +1743,7 @@ define void @test102(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: stwcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acquire acquire + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire acquire ret void } @@ -1764,7 +1764,7 @@ define void @test103(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: # BB#3: ; PPC64LE-NEXT: stwcx. 6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread release monotonic + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release monotonic ret void } @@ -1785,7 +1785,7 @@ define void @test104(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: # BB#3: ; PPC64LE-NEXT: stwcx. 6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread release acquire + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release acquire ret void } @@ -1807,7 +1807,7 @@ define void @test105(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: stwcx. 
6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acq_rel monotonic + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel monotonic ret void } @@ -1829,7 +1829,7 @@ define void @test106(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: stwcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acq_rel acquire + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel acquire ret void } @@ -1851,7 +1851,7 @@ define void @test107(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: stwcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst monotonic + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst monotonic ret void } @@ -1873,7 +1873,7 @@ define void @test108(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: stwcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst acquire + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst acquire ret void } @@ -1895,7 +1895,7 @@ define void @test109(i32* %ptr, i32 %cmp, i32 %val) { ; PPC64LE-NEXT: stwcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst seq_cst + %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst seq_cst ret void } @@ -1915,7 +1915,7 @@ define void @test110(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: # BB#3: ; PPC64LE-NEXT: stdcx. 6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread monotonic monotonic + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") monotonic monotonic ret void } @@ -1936,7 +1936,7 @@ define void @test111(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: stdcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acquire monotonic + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire monotonic ret void } @@ -1957,7 +1957,7 @@ define void @test112(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: stdcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acquire acquire + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire acquire ret void } @@ -1978,7 +1978,7 @@ define void @test113(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: # BB#3: ; PPC64LE-NEXT: stdcx. 6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread release monotonic + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release monotonic ret void } @@ -1999,7 +1999,7 @@ define void @test114(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: # BB#3: ; PPC64LE-NEXT: stdcx. 6, 0, 3 ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread release acquire + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release acquire ret void } @@ -2021,7 +2021,7 @@ define void @test115(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: stdcx. 
6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acq_rel monotonic + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel monotonic ret void } @@ -2043,7 +2043,7 @@ define void @test116(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: stdcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acq_rel acquire + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel acquire ret void } @@ -2065,7 +2065,7 @@ define void @test117(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: stdcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst monotonic + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst monotonic ret void } @@ -2087,7 +2087,7 @@ define void @test118(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: stdcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst acquire + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst acquire ret void } @@ -2109,7 +2109,7 @@ define void @test119(i64* %ptr, i64 %cmp, i64 %val) { ; PPC64LE-NEXT: stdcx. 6, 0, 3 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst seq_cst + %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst seq_cst ret void } @@ -5847,7 +5847,7 @@ define i8 @test340(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -5862,7 +5862,7 @@ define i8 @test341(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -5877,7 +5877,7 @@ define i8 @test342(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread release + %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -5893,7 +5893,7 @@ define i8 @test343(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -5909,7 +5909,7 @@ define i8 @test344(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw xchg i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -5923,7 +5923,7 @@ define i16 @test345(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -5938,7 +5938,7 @@ define i16 @test346(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -5953,7 
+5953,7 @@ define i16 @test347(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread release + %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -5969,7 +5969,7 @@ define i16 @test348(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -5985,7 +5985,7 @@ define i16 @test349(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw xchg i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -5999,7 +5999,7 @@ define i32 @test350(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -6014,7 +6014,7 @@ define i32 @test351(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -6029,7 +6029,7 @@ define i32 @test352(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread release + %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -6045,7 +6045,7 @@ define i32 @test353(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -6061,7 +6061,7 @@ define i32 @test354(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw xchg i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -6075,7 +6075,7 @@ define i64 @test355(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -6090,7 +6090,7 @@ define i64 @test356(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -6105,7 +6105,7 @@ define i64 @test357(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread release + %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -6121,7 +6121,7 @@ define i64 @test358(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -6137,7 +6137,7 @@ define i64 @test359(i64* %ptr, i64 %val) { ; 
PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw xchg i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -6152,7 +6152,7 @@ define i8 @test360(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw add i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -6168,7 +6168,7 @@ define i8 @test361(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -6184,7 +6184,7 @@ define i8 @test362(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw add i8* %ptr, i8 %val singlethread release + %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -6201,7 +6201,7 @@ define i8 @test363(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -6218,7 +6218,7 @@ define i8 @test364(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw add i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -6233,7 +6233,7 @@ define i16 @test365(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw add i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -6249,7 +6249,7 @@ define i16 @test366(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -6265,7 +6265,7 @@ define i16 @test367(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw add i16* %ptr, i16 %val singlethread release + %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -6282,7 +6282,7 @@ define i16 @test368(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -6299,7 +6299,7 @@ define i16 @test369(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw add i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -6314,7 +6314,7 @@ define i32 @test370(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw add i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -6330,7 +6330,7 @@ define i32 @test371(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i32* %ptr, i32 %val 
singlethread acquire + %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -6346,7 +6346,7 @@ define i32 @test372(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw add i32* %ptr, i32 %val singlethread release + %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -6363,7 +6363,7 @@ define i32 @test373(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -6380,7 +6380,7 @@ define i32 @test374(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw add i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -6395,7 +6395,7 @@ define i64 @test375(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw add i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -6411,7 +6411,7 @@ define i64 @test376(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -6427,7 +6427,7 @@ define i64 @test377(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw add i64* %ptr, i64 %val singlethread release + %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -6444,7 +6444,7 @@ define i64 @test378(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -6461,7 +6461,7 @@ define i64 @test379(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw add i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw add i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -6476,7 +6476,7 @@ define i8 @test380(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -6492,7 +6492,7 @@ define i8 @test381(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -6508,7 +6508,7 @@ define i8 @test382(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i8* %ptr, i8 %val singlethread release + %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -6525,7 +6525,7 @@ define i8 @test383(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ 
-6542,7 +6542,7 @@ define i8 @test384(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw sub i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -6557,7 +6557,7 @@ define i16 @test385(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -6573,7 +6573,7 @@ define i16 @test386(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -6589,7 +6589,7 @@ define i16 @test387(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i16* %ptr, i16 %val singlethread release + %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -6606,7 +6606,7 @@ define i16 @test388(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -6623,7 +6623,7 @@ define i16 @test389(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw sub i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -6638,7 +6638,7 @@ define i32 @test390(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -6654,7 +6654,7 @@ define i32 @test391(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -6670,7 +6670,7 @@ define i32 @test392(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i32* %ptr, i32 %val singlethread release + %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -6687,7 +6687,7 @@ define i32 @test393(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -6704,7 +6704,7 @@ define i32 @test394(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw sub i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -6719,7 +6719,7 @@ define i64 @test395(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -6735,7 +6735,7 @@ define i64 @test396(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; 
PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -6751,7 +6751,7 @@ define i64 @test397(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i64* %ptr, i64 %val singlethread release + %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -6768,7 +6768,7 @@ define i64 @test398(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -6785,7 +6785,7 @@ define i64 @test399(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw sub i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw sub i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -6800,7 +6800,7 @@ define i8 @test400(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw and i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -6816,7 +6816,7 @@ define i8 @test401(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -6832,7 +6832,7 @@ define i8 @test402(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw and i8* %ptr, i8 %val singlethread release + %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -6849,7 +6849,7 @@ define i8 @test403(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -6866,7 +6866,7 @@ define i8 @test404(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw and i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -6881,7 +6881,7 @@ define i16 @test405(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw and i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -6897,7 +6897,7 @@ define i16 @test406(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -6913,7 +6913,7 @@ define i16 @test407(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw and i16* %ptr, i16 %val singlethread release + %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -6930,7 +6930,7 @@ define i16 @test408(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw 
and i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -6947,7 +6947,7 @@ define i16 @test409(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw and i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -6962,7 +6962,7 @@ define i32 @test410(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw and i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -6978,7 +6978,7 @@ define i32 @test411(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -6994,7 +6994,7 @@ define i32 @test412(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw and i32* %ptr, i32 %val singlethread release + %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -7011,7 +7011,7 @@ define i32 @test413(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -7028,7 +7028,7 @@ define i32 @test414(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw and i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -7043,7 +7043,7 @@ define i64 @test415(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw and i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -7059,7 +7059,7 @@ define i64 @test416(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -7075,7 +7075,7 @@ define i64 @test417(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw and i64* %ptr, i64 %val singlethread release + %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -7092,7 +7092,7 @@ define i64 @test418(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -7109,7 +7109,7 @@ define i64 @test419(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw and i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw and i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -7124,7 +7124,7 @@ define i8 @test420(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -7140,7 
+7140,7 @@ define i8 @test421(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -7156,7 +7156,7 @@ define i8 @test422(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i8* %ptr, i8 %val singlethread release + %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -7173,7 +7173,7 @@ define i8 @test423(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -7190,7 +7190,7 @@ define i8 @test424(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw nand i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -7205,7 +7205,7 @@ define i16 @test425(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -7221,7 +7221,7 @@ define i16 @test426(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -7237,7 +7237,7 @@ define i16 @test427(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i16* %ptr, i16 %val singlethread release + %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -7254,7 +7254,7 @@ define i16 @test428(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -7271,7 +7271,7 @@ define i16 @test429(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw nand i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -7286,7 +7286,7 @@ define i32 @test430(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -7302,7 +7302,7 @@ define i32 @test431(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -7318,7 +7318,7 @@ define i32 @test432(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i32* %ptr, i32 %val singlethread release + %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -7335,7 +7335,7 @@ define i32 @test433(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: 
lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -7352,7 +7352,7 @@ define i32 @test434(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw nand i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -7367,7 +7367,7 @@ define i64 @test435(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -7383,7 +7383,7 @@ define i64 @test436(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -7399,7 +7399,7 @@ define i64 @test437(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i64* %ptr, i64 %val singlethread release + %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -7416,7 +7416,7 @@ define i64 @test438(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -7433,7 +7433,7 @@ define i64 @test439(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw nand i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw nand i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -7448,7 +7448,7 @@ define i8 @test440(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw or i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -7464,7 +7464,7 @@ define i8 @test441(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -7480,7 +7480,7 @@ define i8 @test442(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw or i8* %ptr, i8 %val singlethread release + %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -7497,7 +7497,7 @@ define i8 @test443(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -7514,7 +7514,7 @@ define i8 @test444(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw or i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -7529,7 +7529,7 @@ define i16 @test445(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw or i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw or i16* 
%ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -7545,7 +7545,7 @@ define i16 @test446(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -7561,7 +7561,7 @@ define i16 @test447(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw or i16* %ptr, i16 %val singlethread release + %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -7578,7 +7578,7 @@ define i16 @test448(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -7595,7 +7595,7 @@ define i16 @test449(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw or i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -7610,7 +7610,7 @@ define i32 @test450(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw or i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -7626,7 +7626,7 @@ define i32 @test451(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -7642,7 +7642,7 @@ define i32 @test452(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw or i32* %ptr, i32 %val singlethread release + %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -7659,7 +7659,7 @@ define i32 @test453(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -7676,7 +7676,7 @@ define i32 @test454(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw or i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -7691,7 +7691,7 @@ define i64 @test455(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw or i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -7707,7 +7707,7 @@ define i64 @test456(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -7723,7 +7723,7 @@ define i64 @test457(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw or i64* %ptr, i64 %val singlethread release + %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -7740,7 +7740,7 @@ define i64 
@test458(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -7757,7 +7757,7 @@ define i64 @test459(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw or i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw or i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -7772,7 +7772,7 @@ define i8 @test460(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -7788,7 +7788,7 @@ define i8 @test461(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -7804,7 +7804,7 @@ define i8 @test462(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i8* %ptr, i8 %val singlethread release + %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -7821,7 +7821,7 @@ define i8 @test463(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -7838,7 +7838,7 @@ define i8 @test464(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw xor i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -7853,7 +7853,7 @@ define i16 @test465(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -7869,7 +7869,7 @@ define i16 @test466(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -7885,7 +7885,7 @@ define i16 @test467(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i16* %ptr, i16 %val singlethread release + %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -7902,7 +7902,7 @@ define i16 @test468(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -7919,7 +7919,7 @@ define i16 @test469(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw xor i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -7934,7 +7934,7 @@ define i32 @test470(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i32* 
%ptr, i32 %val singlethread monotonic + %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -7950,7 +7950,7 @@ define i32 @test471(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -7966,7 +7966,7 @@ define i32 @test472(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i32* %ptr, i32 %val singlethread release + %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -7983,7 +7983,7 @@ define i32 @test473(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -8000,7 +8000,7 @@ define i32 @test474(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw xor i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -8015,7 +8015,7 @@ define i64 @test475(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -8031,7 +8031,7 @@ define i64 @test476(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -8047,7 +8047,7 @@ define i64 @test477(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: # BB#2: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i64* %ptr, i64 %val singlethread release + %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -8064,7 +8064,7 @@ define i64 @test478(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -8081,7 +8081,7 @@ define i64 @test479(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw xor i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw xor i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -8099,7 +8099,7 @@ define i8 @test480(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB480_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw max i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -8118,7 +8118,7 @@ define i8 @test481(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB481_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -8137,7 +8137,7 @@ define i8 @test482(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB482_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw max i8* %ptr, i8 %val singlethread release + %ret = atomicrmw max i8* %ptr, i8 %val 
syncscope("singlethread") release ret i8 %ret } @@ -8157,7 +8157,7 @@ define i8 @test483(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -8177,7 +8177,7 @@ define i8 @test484(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw max i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -8195,7 +8195,7 @@ define i16 @test485(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB485_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw max i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -8214,7 +8214,7 @@ define i16 @test486(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB486_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -8233,7 +8233,7 @@ define i16 @test487(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB487_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw max i16* %ptr, i16 %val singlethread release + %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -8253,7 +8253,7 @@ define i16 @test488(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -8273,7 +8273,7 @@ define i16 @test489(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw max i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -8290,7 +8290,7 @@ define i32 @test490(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB490_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw max i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -8308,7 +8308,7 @@ define i32 @test491(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB491_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -8326,7 +8326,7 @@ define i32 @test492(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB492_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw max i32* %ptr, i32 %val singlethread release + %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -8345,7 +8345,7 @@ define i32 @test493(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -8364,7 +8364,7 @@ define i32 @test494(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw max i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -8381,7 +8381,7 @@ define i64 
@test495(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB495_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw max i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -8399,7 +8399,7 @@ define i64 @test496(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB496_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -8417,7 +8417,7 @@ define i64 @test497(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB497_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw max i64* %ptr, i64 %val singlethread release + %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -8436,7 +8436,7 @@ define i64 @test498(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -8455,7 +8455,7 @@ define i64 @test499(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw max i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw max i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -8473,7 +8473,7 @@ define i8 @test500(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB500_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw min i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -8492,7 +8492,7 @@ define i8 @test501(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB501_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -8511,7 +8511,7 @@ define i8 @test502(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB502_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw min i8* %ptr, i8 %val singlethread release + %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -8531,7 +8531,7 @@ define i8 @test503(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -8551,7 +8551,7 @@ define i8 @test504(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw min i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -8569,7 +8569,7 @@ define i16 @test505(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB505_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw min i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -8588,7 +8588,7 @@ define i16 @test506(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB506_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -8607,7 +8607,7 @@ define i16 @test507(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB507_3: ; PPC64LE-NEXT: mr 3, 5 ; 
PPC64LE-NEXT: blr - %ret = atomicrmw min i16* %ptr, i16 %val singlethread release + %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -8627,7 +8627,7 @@ define i16 @test508(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -8647,7 +8647,7 @@ define i16 @test509(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw min i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -8664,7 +8664,7 @@ define i32 @test510(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB510_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw min i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -8682,7 +8682,7 @@ define i32 @test511(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB511_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -8700,7 +8700,7 @@ define i32 @test512(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB512_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw min i32* %ptr, i32 %val singlethread release + %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -8719,7 +8719,7 @@ define i32 @test513(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -8738,7 +8738,7 @@ define i32 @test514(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw min i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -8755,7 +8755,7 @@ define i64 @test515(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB515_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw min i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -8773,7 +8773,7 @@ define i64 @test516(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB516_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -8791,7 +8791,7 @@ define i64 @test517(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB517_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw min i64* %ptr, i64 %val singlethread release + %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -8810,7 +8810,7 @@ define i64 @test518(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -8829,7 +8829,7 @@ define i64 @test519(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw min i64* %ptr, i64 %val 
singlethread seq_cst + %ret = atomicrmw min i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -8846,7 +8846,7 @@ define i8 @test520(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB520_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -8864,7 +8864,7 @@ define i8 @test521(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB521_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -8882,7 +8882,7 @@ define i8 @test522(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB522_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i8* %ptr, i8 %val singlethread release + %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -8901,7 +8901,7 @@ define i8 @test523(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -8920,7 +8920,7 @@ define i8 @test524(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw umax i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -8937,7 +8937,7 @@ define i16 @test525(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB525_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -8955,7 +8955,7 @@ define i16 @test526(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB526_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -8973,7 +8973,7 @@ define i16 @test527(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB527_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i16* %ptr, i16 %val singlethread release + %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -8992,7 +8992,7 @@ define i16 @test528(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -9011,7 +9011,7 @@ define i16 @test529(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw umax i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -9028,7 +9028,7 @@ define i32 @test530(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB530_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -9046,7 +9046,7 @@ define i32 @test531(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB531_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw umax i32* %ptr, i32 %val 
syncscope("singlethread") acquire ret i32 %ret } @@ -9064,7 +9064,7 @@ define i32 @test532(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB532_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i32* %ptr, i32 %val singlethread release + %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -9083,7 +9083,7 @@ define i32 @test533(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -9102,7 +9102,7 @@ define i32 @test534(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw umax i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -9119,7 +9119,7 @@ define i64 @test535(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB535_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -9137,7 +9137,7 @@ define i64 @test536(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB536_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -9155,7 +9155,7 @@ define i64 @test537(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB537_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i64* %ptr, i64 %val singlethread release + %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -9174,7 +9174,7 @@ define i64 @test538(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -9193,7 +9193,7 @@ define i64 @test539(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umax i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw umax i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } @@ -9210,7 +9210,7 @@ define i8 @test540(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB540_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i8* %ptr, i8 %val singlethread monotonic + %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") monotonic ret i8 %ret } @@ -9228,7 +9228,7 @@ define i8 @test541(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB541_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i8* %ptr, i8 %val singlethread acquire + %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") acquire ret i8 %ret } @@ -9246,7 +9246,7 @@ define i8 @test542(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: .LBB542_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i8* %ptr, i8 %val singlethread release + %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") release ret i8 %ret } @@ -9265,7 +9265,7 @@ define i8 @test543(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i8* %ptr, i8 %val singlethread acq_rel + %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") acq_rel ret i8 %ret } @@ -9284,7 +9284,7 @@ 
define i8 @test544(i8* %ptr, i8 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i8* %ptr, i8 %val singlethread seq_cst + %ret = atomicrmw umin i8* %ptr, i8 %val syncscope("singlethread") seq_cst ret i8 %ret } @@ -9301,7 +9301,7 @@ define i16 @test545(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB545_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i16* %ptr, i16 %val singlethread monotonic + %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") monotonic ret i16 %ret } @@ -9319,7 +9319,7 @@ define i16 @test546(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB546_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i16* %ptr, i16 %val singlethread acquire + %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") acquire ret i16 %ret } @@ -9337,7 +9337,7 @@ define i16 @test547(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: .LBB547_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i16* %ptr, i16 %val singlethread release + %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") release ret i16 %ret } @@ -9356,7 +9356,7 @@ define i16 @test548(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i16* %ptr, i16 %val singlethread acq_rel + %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") acq_rel ret i16 %ret } @@ -9375,7 +9375,7 @@ define i16 @test549(i16* %ptr, i16 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i16* %ptr, i16 %val singlethread seq_cst + %ret = atomicrmw umin i16* %ptr, i16 %val syncscope("singlethread") seq_cst ret i16 %ret } @@ -9392,7 +9392,7 @@ define i32 @test550(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB550_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i32* %ptr, i32 %val singlethread monotonic + %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") monotonic ret i32 %ret } @@ -9410,7 +9410,7 @@ define i32 @test551(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB551_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i32* %ptr, i32 %val singlethread acquire + %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") acquire ret i32 %ret } @@ -9428,7 +9428,7 @@ define i32 @test552(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: .LBB552_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i32* %ptr, i32 %val singlethread release + %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") release ret i32 %ret } @@ -9447,7 +9447,7 @@ define i32 @test553(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i32* %ptr, i32 %val singlethread acq_rel + %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") acq_rel ret i32 %ret } @@ -9466,7 +9466,7 @@ define i32 @test554(i32* %ptr, i32 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i32* %ptr, i32 %val singlethread seq_cst + %ret = atomicrmw umin i32* %ptr, i32 %val syncscope("singlethread") seq_cst ret i32 %ret } @@ -9483,7 +9483,7 @@ define i64 @test555(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB555_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i64* %ptr, i64 %val singlethread monotonic + %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") monotonic ret i64 %ret } @@ -9501,7 +9501,7 @@ define i64 @test556(i64* %ptr, i64 %val) { ; 
PPC64LE-NEXT: .LBB556_3: ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i64* %ptr, i64 %val singlethread acquire + %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") acquire ret i64 %ret } @@ -9519,7 +9519,7 @@ define i64 @test557(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: .LBB557_3: ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i64* %ptr, i64 %val singlethread release + %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") release ret i64 %ret } @@ -9538,7 +9538,7 @@ define i64 @test558(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i64* %ptr, i64 %val singlethread acq_rel + %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") acq_rel ret i64 %ret } @@ -9557,7 +9557,7 @@ define i64 @test559(i64* %ptr, i64 %val) { ; PPC64LE-NEXT: mr 3, 5 ; PPC64LE-NEXT: lwsync ; PPC64LE-NEXT: blr - %ret = atomicrmw umin i64* %ptr, i64 %val singlethread seq_cst + %ret = atomicrmw umin i64* %ptr, i64 %val syncscope("singlethread") seq_cst ret i64 %ret } diff --git a/test/CodeGen/PowerPC/bitreverse.ll b/test/CodeGen/PowerPC/bitreverse.ll deleted file mode 100644 index dca7340d035d6..0000000000000 --- a/test/CodeGen/PowerPC/bitreverse.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=ppc64 %s -o - | FileCheck %s - -; These tests just check that the plumbing is in place for @llvm.bitreverse. The -; actual output is massive at the moment as llvm.bitreverse is not yet legal. - -declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone - -define <2 x i16> @f(<2 x i16> %a) { -; CHECK-LABEL: f: -; CHECK: rlwinm - %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a) - ret <2 x i16> %b -} - -declare i8 @llvm.bitreverse.i8(i8) readnone - -define i8 @g(i8 %a) { -; CHECK-LABEL: g: -; CHECK: rlwinm -; CHECK: rlwimi - %b = call i8 @llvm.bitreverse.i8(i8 %a) - ret i8 %b -} diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll index c42f677d17ab1..60bec4d18f12e 100644 --- a/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/test/CodeGen/PowerPC/build-vector-tests.ll @@ -1028,7 +1028,7 @@ entry: ; P9LE: vperm ; P9LE: blr ; P8BE: sldi {{r[0-9]+}}, r4, 2 -; P8BE-DAG: lxvw4x {{v[0-9]+}}, r3, +; P8BE-DAG: lxvw4x {{v[0-9]+}}, 0, r3 ; P8BE-DAG: lxvw4x ; P8BE: vperm ; P8BE: blr @@ -2187,7 +2187,7 @@ entry: ; P9LE: vperm ; P9LE: blr ; P8BE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P8BE-DAG: lxvw4x {{v[0-9]+}}, r3 +; P8BE-DAG: lxvw4x {{v[0-9]+}}, 0, r3 ; P8BE-DAG: lxvw4x ; P8BE: vperm ; P8BE: blr diff --git a/test/CodeGen/PowerPC/ppc-ctr-dead-code.ll b/test/CodeGen/PowerPC/ppc-ctr-dead-code.ll new file mode 100644 index 0000000000000..71755f722cb2c --- /dev/null +++ b/test/CodeGen/PowerPC/ppc-ctr-dead-code.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s + +; Function Attrs: norecurse nounwind readonly +define signext i32 @limit_loop(i32 signext %iters, i32* nocapture readonly %vec, i32 signext %limit) local_unnamed_addr { +entry: + %cmp5 = icmp sgt i32 %iters, 0 + br i1 %cmp5, label %for.body.preheader, label %cleanup + +for.body.preheader: ; 
preds = %entry
+  %0 = sext i32 %iters to i64
+  br label %for.body
+
+for.cond:                                        ; preds = %for.body
+  %cmp = icmp slt i64 %indvars.iv.next, %0
+  br i1 %cmp, label %for.body, label %cleanup
+
+for.body:                                        ; preds = %for.body.preheader, %for.cond
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.cond ]
+  %arrayidx = getelementptr inbounds i32, i32* %vec, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp slt i32 %1, %limit
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br i1 %cmp1, label %for.cond, label %cleanup
+
+cleanup:                                         ; preds = %for.body, %for.cond, %entry
+  %2 = phi i32 [ 0, %entry ], [ 0, %for.cond ], [ 1, %for.body ]
+  ret i32 %2
+; CHECK-LABEL: limit_loop
+; CHECK: mtctr
+; CHECK-NOT: addi {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK: bdnz
+; CHECK: blr
+}
+
+
diff --git a/test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll
new file mode 100644
index 0000000000000..87b45beeab7e0
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc-redzone-alignment-bug.ll
@@ -0,0 +1,32 @@
+; Note: the alignment formula for a negative offset should be y = x & ~(n-1), not y = (x + (n-1)) & ~(n-1).
+; After patch https://reviews.llvm.org/D34337 this saves 16 bytes in the best case (a short arithmetic sketch follows the SystemZ kill-flag test later in this diff).
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-LE
+
+define signext i32 @bar(i32 signext %ii) {
+entry:
+  %0 = tail call i32 asm sideeffect "add $0, $1, $2\0A", "=r,r,r,~{f14},~{r15},~{v20}"(i32 %ii, i32 10)
+  ret i32 %0
+; Before the fix by patch D34337:
+; stdu 1, -544(1)
+; std 15, 264(1)
+; stfd 14, 400(1)
+; stdu 1, -560(1)
+; std 15, 280(1)
+; stfd 14, 416(1)
+
+; After the fix by patch D34337:
+; CHECK-LE: stdu 1, -528(1)
+; CHECK-LE: std 15, 248(1)
+; CHECK-LE: stfd 14, 384(1)
+; CHECK-BE: stdu 1, -544(1)
+; CHECK-BE: std 15, 264(1)
+; CHECK-BE: stfd 14, 400(1)
+}
+
+define signext i32 @foo() {
+entry:
+  %call = tail call signext i32 @bar(i32 signext 5)
+  ret i32 %call
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64le-smallarg.ll
index 0e871c3588691..3a425406d043f 100644
--- a/test/CodeGen/PowerPC/ppc64le-smallarg.ll
+++ b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
@@ -53,8 +53,8 @@ entry:
   ret void
 }
 ; CHECK: @caller2
-; CHECK: li [[TOCOFF:[0-9]+]], 136
-; CHECK: stxsspx {{[0-9]+}}, 1, [[TOCOFF]]
+; CHECK: addi [[TOCOFF:[0-9]+]], {{[0-9]+}}, 136
+; CHECK: stxsspx {{[0-9]+}}, 0, [[TOCOFF]]
 ; CHECK: bl test2
 
 declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)
diff --git a/test/CodeGen/PowerPC/pr33093.ll
new file mode 100644
index 0000000000000..5212973f8317c
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr33093.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+
+define zeroext i32 @ReverseBits(i32 zeroext %n) {
+; CHECK-LABEL: ReverseBits:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lis 4, -21846
+; CHECK-NEXT: lis 5, 21845
+; CHECK-NEXT: slwi 6, 3, 1
+; CHECK-NEXT: srwi 3, 3, 1
+; CHECK-NEXT: lis 7, -13108
+; CHECK-NEXT: lis 8, 13107
+; CHECK-NEXT: ori 4, 4, 43690
+; CHECK-NEXT:
ori 5, 5, 21845 +; CHECK-NEXT: lis 10, -3856 +; CHECK-NEXT: lis 11, 3855 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: and 4, 6, 4 +; CHECK-NEXT: ori 5, 8, 13107 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: ori 4, 7, 52428 +; CHECK-NEXT: slwi 9, 3, 2 +; CHECK-NEXT: srwi 3, 3, 2 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: and 4, 9, 4 +; CHECK-NEXT: ori 5, 11, 3855 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: ori 4, 10, 61680 +; CHECK-NEXT: slwi 12, 3, 4 +; CHECK-NEXT: srwi 3, 3, 4 +; CHECK-NEXT: and 4, 12, 4 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: rotlwi 4, 3, 24 +; CHECK-NEXT: rlwimi 4, 3, 8, 8, 15 +; CHECK-NEXT: rlwimi 4, 3, 8, 24, 31 +; CHECK-NEXT: rldicl 3, 4, 0, 32 +; CHECK-NEXT: clrldi 3, 3, 32 +; CHECK-NEXT: blr +entry: + %shr = lshr i32 %n, 1 + %and = and i32 %shr, 1431655765 + %and1 = shl i32 %n, 1 + %shl = and i32 %and1, -1431655766 + %or = or i32 %and, %shl + %shr2 = lshr i32 %or, 2 + %and3 = and i32 %shr2, 858993459 + %and4 = shl i32 %or, 2 + %shl5 = and i32 %and4, -858993460 + %or6 = or i32 %and3, %shl5 + %shr7 = lshr i32 %or6, 4 + %and8 = and i32 %shr7, 252645135 + %and9 = shl i32 %or6, 4 + %shl10 = and i32 %and9, -252645136 + %or11 = or i32 %and8, %shl10 + %shr13 = lshr i32 %or11, 24 + %and14 = lshr i32 %or11, 8 + %shr15 = and i32 %and14, 65280 + %and17 = shl i32 %or11, 8 + %shl18 = and i32 %and17, 16711680 + %shl21 = shl i32 %or11, 24 + %or16 = or i32 %shl21, %shr13 + %or19 = or i32 %or16, %shr15 + %or22 = or i32 %or19, %shl18 + ret i32 %or22 +} + +define i64 @ReverseBits64(i64 %n) { +; CHECK-LABEL: ReverseBits64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lis 4, -21846 +; CHECK-NEXT: lis 5, 21845 +; CHECK-NEXT: lis 6, -13108 +; CHECK-NEXT: lis 7, 13107 +; CHECK-NEXT: sldi 8, 3, 1 +; CHECK-NEXT: rldicl 3, 3, 63, 1 +; CHECK-NEXT: ori 4, 4, 43690 +; CHECK-NEXT: ori 5, 5, 21845 +; CHECK-NEXT: ori 6, 6, 52428 +; CHECK-NEXT: ori 7, 7, 13107 +; CHECK-NEXT: sldi 4, 4, 32 +; CHECK-NEXT: sldi 5, 5, 32 +; CHECK-NEXT: oris 4, 4, 43690 +; CHECK-NEXT: oris 5, 5, 21845 +; CHECK-NEXT: ori 4, 4, 43690 +; CHECK-NEXT: ori 5, 5, 21845 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: sldi 5, 6, 32 +; CHECK-NEXT: sldi 6, 7, 32 +; CHECK-NEXT: and 4, 8, 4 +; CHECK-NEXT: lis 7, 3855 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: oris 12, 5, 52428 +; CHECK-NEXT: oris 9, 6, 13107 +; CHECK-NEXT: lis 6, -3856 +; CHECK-NEXT: ori 7, 7, 3855 +; CHECK-NEXT: sldi 8, 3, 2 +; CHECK-NEXT: ori 4, 12, 52428 +; CHECK-NEXT: rldicl 3, 3, 62, 2 +; CHECK-NEXT: ori 5, 9, 13107 +; CHECK-NEXT: ori 6, 6, 61680 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: sldi 5, 6, 32 +; CHECK-NEXT: and 4, 8, 4 +; CHECK-NEXT: sldi 6, 7, 32 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: oris 10, 5, 61680 +; CHECK-NEXT: oris 11, 6, 3855 +; CHECK-NEXT: sldi 6, 3, 4 +; CHECK-NEXT: ori 4, 10, 61680 +; CHECK-NEXT: rldicl 3, 3, 60, 4 +; CHECK-NEXT: ori 5, 11, 3855 +; CHECK-NEXT: and 4, 6, 4 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: rldicl 4, 3, 32, 32 +; CHECK-NEXT: rlwinm 6, 3, 24, 0, 31 +; CHECK-NEXT: rlwinm 5, 4, 24, 0, 31 +; CHECK-NEXT: rlwimi 6, 3, 8, 8, 15 +; CHECK-NEXT: rlwimi 5, 4, 8, 8, 15 +; CHECK-NEXT: rlwimi 6, 3, 8, 24, 31 +; CHECK-NEXT: rlwimi 5, 4, 8, 24, 31 +; CHECK-NEXT: sldi 12, 5, 32 +; CHECK-NEXT: or 3, 12, 6 +; CHECK-NEXT: blr +entry: + %shr = lshr i64 %n, 1 + %and = and i64 %shr, 6148914691236517205 + %and1 = shl i64 %n, 1 + %shl = and i64 %and1, -6148914691236517206 + %or = or i64 %and, %shl + %shr2 = lshr i64 %or, 2 + %and3 = and i64 %shr2, 3689348814741910323 + %and4 = shl i64 %or, 2 + %shl5 = and i64 
%and4, -3689348814741910324 + %or6 = or i64 %and3, %shl5 + %shr7 = lshr i64 %or6, 4 + %and8 = and i64 %shr7, 1085102592571150095 + %and9 = shl i64 %or6, 4 + %shl10 = and i64 %and9, -1085102592571150096 + %or11 = or i64 %and8, %shl10 + %shr13 = lshr i64 %or11, 56 + %and14 = lshr i64 %or11, 40 + %shr15 = and i64 %and14, 65280 + %and17 = lshr i64 %or11, 24 + %shr18 = and i64 %and17, 16711680 + %and20 = lshr i64 %or11, 8 + %shr21 = and i64 %and20, 4278190080 + %and23 = shl i64 %or11, 8 + %shl24 = and i64 %and23, 1095216660480 + %and26 = shl i64 %or11, 24 + %shl27 = and i64 %and26, 280375465082880 + %and29 = shl i64 %or11, 40 + %shl30 = and i64 %and29, 71776119061217280 + %shl33 = shl i64 %or11, 56 + %or16 = or i64 %shl33, %shr13 + %or19 = or i64 %or16, %shr15 + %or22 = or i64 %or19, %shr18 + %or25 = or i64 %or22, %shr21 + %or28 = or i64 %or25, %shl24 + %or31 = or i64 %or28, %shl27 + %or34 = or i64 %or31, %shl30 + ret i64 %or34 +} diff --git a/test/CodeGen/PowerPC/select-addrRegRegOnly.ll b/test/CodeGen/PowerPC/select-addrRegRegOnly.ll new file mode 100644 index 0000000000000..f880d1faf9d90 --- /dev/null +++ b/test/CodeGen/PowerPC/select-addrRegRegOnly.ll @@ -0,0 +1,37 @@ +; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-unknown -verify-machineinstrs < %s | FileCheck %s + +; Function Attrs: norecurse nounwind readonly +define float @testSingleAccess(i32* nocapture readonly %arr) local_unnamed_addr #0 { +; CHECK-LABEL: testSingleAccess: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addi 3, 3, 8 +; CHECK-NEXT: lxsiwax 0, 0, 3 +; CHECK-NEXT: xscvsxdsp 1, 0 +; CHECK-NEXT: blr +entry: + %arrayidx = getelementptr inbounds i32, i32* %arr, i64 2 + %0 = load i32, i32* %arrayidx, align 4 + %conv = sitofp i32 %0 to float + ret float %conv +} + +; Function Attrs: norecurse nounwind readonly +define float @testMultipleAccess(i32* nocapture readonly %arr) local_unnamed_addr #0 { +; CHECK-LABEL: testMultipleAccess: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lwz 4, 8(3) +; CHECK-NEXT: lwz 12, 12(3) +; CHECK-NEXT: add 3, 12, 4 +; CHECK-NEXT: mtvsrwa 0, 3 +; CHECK-NEXT: xscvsxdsp 1, 0 +; CHECK-NEXT: blr +entry: + %arrayidx = getelementptr inbounds i32, i32* %arr, i64 2 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 3 + %1 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %1, %0 + %conv = sitofp i32 %add to float + ret float %conv +} diff --git a/test/CodeGen/PowerPC/svr4-redzone.ll b/test/CodeGen/PowerPC/svr4-redzone.ll index 7bb6cc180c962..26c4410ded6d2 100644 --- a/test/CodeGen/PowerPC/svr4-redzone.ll +++ b/test/CodeGen/PowerPC/svr4-redzone.ll @@ -29,11 +29,11 @@ entry: define i8* @bigstack() nounwind { entry: - %0 = alloca i8, i32 230 + %0 = alloca i8, i32 290 ret i8* %0 } ; PPC32-LABEL: bigstack: -; PPC32: stwu 1, -240(1) +; PPC32: stwu 1, -304(1) ; PPC64-LABEL: bigstack: -; PPC64: stdu 1, -288(1) +; PPC64: stdu 1, -352(1) diff --git a/test/CodeGen/PowerPC/tailcall1-64.ll b/test/CodeGen/PowerPC/tailcall1-64.ll index 3dc2672556eaf..58ab0bce309c2 100644 --- a/test/CodeGen/PowerPC/tailcall1-64.ll +++ b/test/CodeGen/PowerPC/tailcall1-64.ll @@ -1,4 +1,5 @@ -; RUN: llc -relocation-model=static -verify-machineinstrs < %s -march=ppc64 -tailcallopt | grep TC_RETURNd8 +; RUN: llc -relocation-model=static -verify-machineinstrs < %s -mtriple=ppc64-- -tailcallopt | grep TC_RETURNd8 +; RUN: llc -relocation-model=static -verify-machineinstrs -mtriple=ppc64-- < %s | 
FileCheck %s define fastcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { entry: ret i32 %a3 @@ -6,6 +7,8 @@ entry: define fastcc i32 @tailcaller(i32 %in1, i32 %in2) { entry: - %tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; <i32> [#uses=1] + %tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ret i32 %tmp11 +; CHECK-LABEL: tailcaller +; CHECK-NOT: stdu } diff --git a/test/CodeGen/PowerPC/testBitReverse.ll b/test/CodeGen/PowerPC/testBitReverse.ll new file mode 100644 index 0000000000000..6993d17ad8f34 --- /dev/null +++ b/test/CodeGen/PowerPC/testBitReverse.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s +declare i32 @llvm.bitreverse.i32(i32) +define i32 @testBitReverseIntrinsicI32(i32 %arg) { +; CHECK-LABEL: testBitReverseIntrinsicI32: +; CHECK: # BB#0: +; CHECK-NEXT: lis 4, -21846 +; CHECK-NEXT: lis 5, 21845 +; CHECK-NEXT: slwi 6, 3, 1 +; CHECK-NEXT: srwi 3, 3, 1 +; CHECK-NEXT: lis 7, -13108 +; CHECK-NEXT: lis 8, 13107 +; CHECK-NEXT: ori 4, 4, 43690 +; CHECK-NEXT: ori 5, 5, 21845 +; CHECK-NEXT: lis 10, -3856 +; CHECK-NEXT: lis 11, 3855 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: and 4, 6, 4 +; CHECK-NEXT: ori 5, 8, 13107 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: ori 4, 7, 52428 +; CHECK-NEXT: slwi 9, 3, 2 +; CHECK-NEXT: srwi 3, 3, 2 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: and 4, 9, 4 +; CHECK-NEXT: ori 5, 11, 3855 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: ori 4, 10, 61680 +; CHECK-NEXT: slwi 12, 3, 4 +; CHECK-NEXT: srwi 3, 3, 4 +; CHECK-NEXT: and 4, 12, 4 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: rotlwi 4, 3, 24 +; CHECK-NEXT: rlwimi 4, 3, 8, 8, 15 +; CHECK-NEXT: rlwimi 4, 3, 8, 24, 31 +; CHECK-NEXT: rldicl 3, 4, 0, 32 +; CHECK-NEXT: blr + %res = call i32 @llvm.bitreverse.i32(i32 %arg) + ret i32 %res +} + +declare i64 @llvm.bitreverse.i64(i64) +define i64 @testBitReverseIntrinsicI64(i64 %arg) { +; CHECK-LABEL: testBitReverseIntrinsicI64: +; CHECK: # BB#0: +; CHECK-NEXT: lis 4, -21846 +; CHECK-NEXT: lis 5, 21845 +; CHECK-NEXT: lis 6, -13108 +; CHECK-NEXT: lis 7, 13107 +; CHECK-NEXT: sldi 8, 3, 1 +; CHECK-NEXT: rldicl 3, 3, 63, 1 +; CHECK-NEXT: ori 4, 4, 43690 +; CHECK-NEXT: ori 5, 5, 21845 +; CHECK-NEXT: ori 6, 6, 52428 +; CHECK-NEXT: ori 7, 7, 13107 +; CHECK-NEXT: sldi 4, 4, 32 +; CHECK-NEXT: sldi 5, 5, 32 +; CHECK-NEXT: oris 4, 4, 43690 +; CHECK-NEXT: oris 5, 5, 21845 +; CHECK-NEXT: ori 4, 4, 43690 +; CHECK-NEXT: ori 5, 5, 21845 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: sldi 5, 6, 32 +; CHECK-NEXT: sldi 6, 7, 32 +; CHECK-NEXT: and 4, 8, 4 +; CHECK-NEXT: lis 7, 3855 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: oris 12, 5, 52428 +; CHECK-NEXT: oris 9, 6, 13107 +; CHECK-NEXT: lis 6, -3856 +; CHECK-NEXT: ori 7, 7, 3855 +; CHECK-NEXT: sldi 8, 3, 2 +; CHECK-NEXT: ori 4, 12, 52428 +; CHECK-NEXT: rldicl 3, 3, 62, 2 +; CHECK-NEXT: ori 5, 9, 13107 +; CHECK-NEXT: ori 6, 6, 61680 +; CHECK-NEXT: and 3, 3, 5 +; CHECK-NEXT: sldi 5, 6, 32 +; CHECK-NEXT: and 4, 8, 4 +; CHECK-NEXT: sldi 6, 7, 32 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: oris 10, 5, 61680 +; CHECK-NEXT: oris 11, 6, 3855 +; CHECK-NEXT: sldi 6, 3, 4 +; CHECK-NEXT: ori 4, 10, 61680 +; CHECK-NEXT: rldicl 3, 3, 60, 4 +; CHECK-NEXT: ori 5, 11, 3855 +; CHECK-NEXT: and 4, 6, 4 +; CHECK-NEXT: and 3, 
3, 5 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: rldicl 4, 3, 32, 32 +; CHECK-NEXT: rlwinm 6, 3, 24, 0, 31 +; CHECK-NEXT: rlwinm 5, 4, 24, 0, 31 +; CHECK-NEXT: rlwimi 6, 3, 8, 8, 15 +; CHECK-NEXT: rlwimi 5, 4, 8, 8, 15 +; CHECK-NEXT: rlwimi 6, 3, 8, 24, 31 +; CHECK-NEXT: rlwimi 5, 4, 8, 24, 31 +; CHECK-NEXT: sldi 12, 5, 32 +; CHECK-NEXT: or 3, 12, 6 +; CHECK-NEXT: blr + %res = call i64 @llvm.bitreverse.i64(i64 %arg) + ret i64 %res +} diff --git a/test/CodeGen/PowerPC/vec_extract_p9.ll b/test/CodeGen/PowerPC/vec_extract_p9.ll new file mode 100644 index 0000000000000..241209a0e6b75 --- /dev/null +++ b/test/CodeGen/PowerPC/vec_extract_p9.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-LE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-BE + +define zeroext i8 @test1(<16 x i8> %a, i32 signext %index) { +; CHECK-LE-LABEL: test1: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextubrx 3, 5, 2 +; CHECK-LE-NEXT: clrldi 3, 3, 56 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: test1: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextublx 3, 5, 2 +; CHECK-BE-NEXT: clrldi 3, 3, 56 +; CHECK-BE-NEXT: blr + +entry: + %vecext = extractelement <16 x i8> %a, i32 %index + ret i8 %vecext +} + +define signext i8 @test2(<16 x i8> %a, i32 signext %index) { +; CHECK-LE-LABEL: test2: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextubrx 3, 5, 2 +; CHECK-LE-NEXT: extsb 3, 3 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: test2: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextublx 3, 5, 2 +; CHECK-BE-NEXT: extsb 3, 3 +; CHECK-BE-NEXT: blr + +entry: + %vecext = extractelement <16 x i8> %a, i32 %index + ret i8 %vecext +} + +define zeroext i16 @test3(<8 x i16> %a, i32 signext %index) { +; CHECK-LE-LABEL: test3: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: rlwinm 3, 5, 1, 28, 30 +; CHECK-LE-NEXT: vextuhrx 3, 3, 2 +; CHECK-LE-NEXT: clrldi 3, 3, 48 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: test3: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: rlwinm 3, 5, 1, 28, 30 +; CHECK-BE-NEXT: vextuhlx 3, 3, 2 +; CHECK-BE-NEXT: clrldi 3, 3, 48 +; CHECK-BE-NEXT: blr + +entry: + %vecext = extractelement <8 x i16> %a, i32 %index + ret i16 %vecext +} + +define signext i16 @test4(<8 x i16> %a, i32 signext %index) { +; CHECK-LE-LABEL: test4: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: rlwinm 3, 5, 1, 28, 30 +; CHECK-LE-NEXT: vextuhrx 3, 3, 2 +; CHECK-LE-NEXT: extsh 3, 3 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: test4: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: rlwinm 3, 5, 1, 28, 30 +; CHECK-BE-NEXT: vextuhlx 3, 3, 2 +; CHECK-BE-NEXT: extsh 3, 3 +; CHECK-BE-NEXT: blr + +entry: + %vecext = extractelement <8 x i16> %a, i32 %index + ret i16 %vecext +} + +define zeroext i32 @test5(<4 x i32> %a, i32 signext %index) { +; CHECK-LE-LABEL: test5: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: rlwinm 3, 5, 2, 28, 29 +; CHECK-LE-NEXT: vextuwrx 3, 3, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: test5: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: rlwinm 3, 5, 2, 28, 29 +; CHECK-BE-NEXT: vextuwlx 3, 3, 2 +; CHECK-BE-NEXT: blr + +entry: + %vecext = extractelement <4 x i32> %a, i32 %index + ret i32 %vecext +} + +define signext i32 @test6(<4 x i32> %a, i32 signext %index) { +; CHECK-LE-LABEL: test6: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: rlwinm 3, 5, 2, 28, 29 +; CHECK-LE-NEXT: vextuwrx 3, 3, 2 
+; CHECK-LE-NEXT: extsw 3, 3 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: test6: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: rlwinm 3, 5, 2, 28, 29 +; CHECK-BE-NEXT: vextuwlx 3, 3, 2 +; CHECK-BE-NEXT: extsw 3, 3 +; CHECK-BE-NEXT: blr + +entry: + %vecext = extractelement <4 x i32> %a, i32 %index + ret i32 %vecext +} + +; Test with immediate index +define zeroext i8 @test7(<16 x i8> %a) { +; CHECK-LE-LABEL: test7: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: li 3, 1 +; CHECK-LE-NEXT: vextubrx 3, 3, 2 +; CHECK-LE-NEXT: clrldi 3, 3, 56 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: test7: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: li 3, 1 +; CHECK-BE-NEXT: vextublx 3, 3, 2 +; CHECK-BE-NEXT: clrldi 3, 3, 56 +; CHECK-BE-NEXT: blr + +entry: + %vecext = extractelement <16 x i8> %a, i32 1 + ret i8 %vecext +} + +define zeroext i16 @test8(<8 x i16> %a) { +; CHECK-LE-LABEL: test8: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: li 3, 2 +; CHECK-LE-NEXT: vextuhrx 3, 3, 2 +; CHECK-LE-NEXT: clrldi 3, 3, 48 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: test8: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: li 3, 2 +; CHECK-BE-NEXT: vextuhlx 3, 3, 2 +; CHECK-BE-NEXT: clrldi 3, 3, 48 +; CHECK-BE-NEXT: blr + +entry: + %vecext = extractelement <8 x i16> %a, i32 1 + ret i16 %vecext +} + +define zeroext i32 @test9(<4 x i32> %a) { +; CHECK-LE-LABEL: test9: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: li 3, 4 +; CHECK-LE-NEXT: vextuwrx 3, 3, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: test9: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: li 3, 4 +; CHECK-BE-NEXT: vextuwlx 3, 3, 2 +; CHECK-BE-NEXT: blr + +entry: + %vecext = extractelement <4 x i32> %a, i32 1 + ret i32 %vecext +} diff --git a/test/CodeGen/PowerPC/vec_int_ext.ll b/test/CodeGen/PowerPC/vec_int_ext.ll index 9e1218c423b7c..d7bed503318eb 100644 --- a/test/CodeGen/PowerPC/vec_int_ext.ll +++ b/test/CodeGen/PowerPC/vec_int_ext.ll @@ -1,12 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs -mcpu=pwr9 < %s | FileCheck %s -check-prefix=PWR9 -target triple = "powerpc64le-unknown-linux-gnu" - -define <4 x i32> @vextsb2w(<16 x i8> %a) { -; PWR9-LABEL: vextsb2w: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsb2w 2, 2 -; PWR9-NEXT: blr +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-LE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-BE + +define <4 x i32> @vextsb2wLE(<16 x i8> %a) { +; CHECK-LE-LABEL: vextsb2wLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsb2w 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsb2wLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vperm 2, 2, 2, 3 +; CHECK-BE-NEXT: vextsb2w 2, 2 +; CHECK-BE-NEXT: blr + entry: %vecext = extractelement <16 x i8> %a, i32 0 %conv = sext i8 %vecext to i32 @@ -23,11 +29,17 @@ entry: ret <4 x i32> %vecinit9 } -define <2 x i64> @vextsb2d(<16 x i8> %a) { -; PWR9-LABEL: vextsb2d: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsb2d 2, 2 -; PWR9-NEXT: blr +define <2 x i64> @vextsb2dLE(<16 x i8> %a) { +; CHECK-LE-LABEL: vextsb2dLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsb2d 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsb2dLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vperm 2, 2, 2, 3 +; CHECK-BE-NEXT: vextsb2d 2, 2 +; CHECK-BE-NEXT: blr + entry: %vecext = extractelement <16 x i8> %a, i32 0 %conv = sext i8 %vecext to i64 @@ -38,11 +50,17 @@ entry: ret <2 x i64> %vecinit3 } 
-define <4 x i32> @vextsh2w(<8 x i16> %a) { -; PWR9-LABEL: vextsh2w: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsh2w 2, 2 -; PWR9-NEXT: blr +define <4 x i32> @vextsh2wLE(<8 x i16> %a) { +; CHECK-LE-LABEL: vextsh2wLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsh2w 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsh2wLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vperm 2, 2, 2, 3 +; CHECK-BE-NEXT: vextsh2w 2, 2 +; CHECK-BE-NEXT: blr + entry: %vecext = extractelement <8 x i16> %a, i32 0 %conv = sext i16 %vecext to i32 @@ -59,11 +77,17 @@ entry: ret <4 x i32> %vecinit9 } -define <2 x i64> @vextsh2d(<8 x i16> %a) { -; PWR9-LABEL: vextsh2d: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsh2d 2, 2 -; PWR9-NEXT: blr +define <2 x i64> @vextsh2dLE(<8 x i16> %a) { +; CHECK-LE-LABEL: vextsh2dLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsh2d 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsh2dLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vperm 2, 2, 2, 3 +; CHECK-BE-NEXT: vextsh2d 2, 2 +; CHECK-BE-NEXT: blr + entry: %vecext = extractelement <8 x i16> %a, i32 0 %conv = sext i16 %vecext to i64 @@ -74,11 +98,17 @@ entry: ret <2 x i64> %vecinit3 } -define <2 x i64> @vextsw2d(<4 x i32> %a) { -; PWR9-LABEL: vextsw2d: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsw2d 2, 2 -; PWR9-NEXT: blr +define <2 x i64> @vextsw2dLE(<4 x i32> %a) { +; CHECK-LE-LABEL: vextsw2dLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsw2d 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsw2dLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vmrgew +; CHECK-BE-NEXT: vextsw2d 2, 2 +; CHECK-BE-NEXT: blr + entry: %vecext = extractelement <4 x i32> %a, i32 0 %conv = sext i32 %vecext to i64 @@ -88,3 +118,170 @@ entry: %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 ret <2 x i64> %vecinit3 } + +define <4 x i32> @vextsb2wBE(<16 x i8> %a) { +; CHECK-BE-LABEL: vextsb2wBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsb2w 2, 2 +; CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsb2wBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 13 +; CHECK-LE-NEXT: vextsb2w 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <16 x i8> %a, i32 3 + %conv = sext i8 %vecext to i32 + %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 + %vecext1 = extractelement <16 x i8> %a, i32 7 + %conv2 = sext i8 %vecext1 to i32 + %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1 + %vecext4 = extractelement <16 x i8> %a, i32 11 + %conv5 = sext i8 %vecext4 to i32 + %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2 + %vecext7 = extractelement <16 x i8> %a, i32 15 + %conv8 = sext i8 %vecext7 to i32 + %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3 + ret <4 x i32> %vecinit9 +} + +define <2 x i64> @vextsb2dBE(<16 x i8> %a) { +; CHECK-BE-LABEL: vextsb2dBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsb2d 2, 2 +; CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsb2dBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 9 +; CHECK-LE-NEXT: vextsb2d 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <16 x i8> %a, i32 7 + %conv = sext i8 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <16 x i8> %a, i32 15 + %conv2 = sext i8 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <4 x i32> @vextsh2wBE(<8 x i16> %a) { +; CHECK-BE-LABEL: vextsh2wBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsh2w 2, 2 +; 
CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsh2wBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 14 +; CHECK-LE-NEXT: vextsh2w 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <8 x i16> %a, i32 1 + %conv = sext i16 %vecext to i32 + %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 + %vecext1 = extractelement <8 x i16> %a, i32 3 + %conv2 = sext i16 %vecext1 to i32 + %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1 + %vecext4 = extractelement <8 x i16> %a, i32 5 + %conv5 = sext i16 %vecext4 to i32 + %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2 + %vecext7 = extractelement <8 x i16> %a, i32 7 + %conv8 = sext i16 %vecext7 to i32 + %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3 + ret <4 x i32> %vecinit9 +} + +define <2 x i64> @vextsh2dBE(<8 x i16> %a) { +; CHECK-BE-LABEL: vextsh2dBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsh2d 2, 2 +; CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsh2dBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 10 +; CHECK-LE-NEXT: vextsh2d 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <8 x i16> %a, i32 3 + %conv = sext i16 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <8 x i16> %a, i32 7 + %conv2 = sext i16 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <2 x i64> @vextsw2dBE(<4 x i32> %a) { +; CHECK-BE-LABEL: vextsw2dBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsw2d 2, 2 +; CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsw2dBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 12 +; CHECK-LE-NEXT: vextsw2d 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <4 x i32> %a, i32 1 + %conv = sext i32 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <4 x i32> %a, i32 3 + %conv2 = sext i32 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <2 x i64> @vextDiffVectors(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LE-LABEL: vextDiffVectors: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NOT: vextsw2d + +; CHECK-BE-LABEL: vextDiffVectors: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NOT: vextsw2d +entry: + %vecext = extractelement <4 x i32> %a, i32 0 + %conv = sext i32 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <4 x i32> %b, i32 2 + %conv2 = sext i32 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <8 x i16> @testInvalidExtend(<16 x i8> %a) { +entry: +; CHECK-LE-LABEL: testInvalidExtend: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NOT: vexts + +; CHECK-BE-LABEL: testInvalidExtend: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NOT: vexts + + %vecext = extractelement <16 x i8> %a, i32 0 + %conv = sext i8 %vecext to i16 + %vecinit = insertelement <8 x i16> undef, i16 %conv, i32 0 + %vecext1 = extractelement <16 x i8> %a, i32 2 + %conv2 = sext i8 %vecext1 to i16 + %vecinit3 = insertelement <8 x i16> %vecinit, i16 %conv2, i32 1 + %vecext4 = extractelement <16 x i8> %a, i32 4 + %conv5 = sext i8 %vecext4 to i16 + %vecinit6 = insertelement <8 x i16> %vecinit3, i16 %conv5, i32 2 + %vecext7 = extractelement <16 x i8> %a, i32 6 + %conv8 = sext i8 %vecext7 to i16 + %vecinit9 = insertelement <8 x i16> %vecinit6, i16 %conv8, i32 3 + %vecext10 = extractelement <16 x i8> %a, 
i32 8
+  %conv11 = sext i8 %vecext10 to i16
+  %vecinit12 = insertelement <8 x i16> %vecinit9, i16 %conv11, i32 4
+  %vecext13 = extractelement <16 x i8> %a, i32 10
+  %conv14 = sext i8 %vecext13 to i16
+  %vecinit15 = insertelement <8 x i16> %vecinit12, i16 %conv14, i32 5
+  %vecext16 = extractelement <16 x i8> %a, i32 12
+  %conv17 = sext i8 %vecext16 to i16
+  %vecinit18 = insertelement <8 x i16> %vecinit15, i16 %conv17, i32 6
+  %vecext19 = extractelement <16 x i8> %a, i32 14
+  %conv20 = sext i8 %vecext19 to i16
+  %vecinit21 = insertelement <8 x i16> %vecinit18, i16 %conv20, i32 7
+  ret <8 x i16> %vecinit21
+}
diff --git a/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
index 67146e40db0e8..5346d8a429fbd 100644
--- a/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
+++ b/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
@@ -321,8 +321,8 @@ entry:
 ; CHECK: lxsibzx 34, 0, 3
 ; CHECK-NEXT: vspltb 2, 2, 7
 ; CHECK-BE-LABEL: vecucus
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
 ; CHECK-BE-NEXT: vspltb 2, 2, 7
 }
@@ -385,8 +385,8 @@ entry:
 ; CHECK: lxsibzx 34, 0, 3
 ; CHECK-NEXT: vspltb 2, 2, 7
 ; CHECK-BE-LABEL: vecscus
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
 ; CHECK-BE-NEXT: vspltb 2, 2, 7
 }
@@ -487,8 +487,8 @@ entry:
 ; CHECK: lxsibzx 34, 0, 3
 ; CHECK-NEXT: vspltb 2, 2, 7
 ; CHECK-BE-LABEL: vecucss
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
 ; CHECK-BE-NEXT: vspltb 2, 2, 7
 }
@@ -540,8 +540,8 @@ entry:
 ; CHECK: lxsibzx 34, 0, 3
 ; CHECK-NEXT: vspltb 2, 2, 7
 ; CHECK-BE-LABEL: vecscss
-; CHECK-BE: li [[OFFSET:[0-9]+]], 1
-; CHECK-BE-NEXT: lxsibzx 34, 3, [[OFFSET]]
+; CHECK-BE: addi [[OFFSET:[0-9]+]], [[OFFSET]], 1
+; CHECK-BE-NEXT: lxsibzx 34, 0, [[OFFSET]]
 ; CHECK-BE-NEXT: vspltb 2, 2, 7
 }
diff --git a/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
new file mode 100644
index 0000000000000..8798fcecfc3b9
--- /dev/null
+++ b/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
@@ -0,0 +1,34 @@
+# RUN: llc -verify-machineinstrs -run-pass regallocfast -mtriple s390x-ibm-linux -o - %s | FileCheck %s
+--- |
+
+  @g_167 = external global [5 x i64], align 8
+  define void @main() local_unnamed_addr {
+    ret void
+  }
+...
+# Make sure the usage of different subregisters on the same virtual register
+# does not result in invalid kill flags.
+# PR33677
+---
+name: main
+alignment: 2
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr128bit }
+  - { id: 1, class: gr64bit }
+  - { id: 2, class: addr64bit }
+# CHECK: %r0q = L128
+# CHECK-NEXT: %r0l = COPY %r1l
+# Although R0L partially redefines R0Q, it must not mark R0Q as kill
+# because R1D is still live through that instruction.
+# CHECK-NOT: %r0q<imp-use,kill>
+# CHECK-NEXT: %r2d = COPY %r1d
+# CHECK-NEXT: LARL
+body: |
+  bb.0:
+    %0.subreg_hl32 = COPY %0.subreg_l32
+    %1 = COPY %0.subreg_l64
+    %2 = LARL @g_167
+    STC %1.subreg_l32, %2, 8, _
+
+...
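(Aside on the alignment note in ppc-redzone-alignment-bug.ll earlier in this diff: the two formulas are easy to compare in isolation. The sketch below is an editorial illustration, not part of the commit; the helper names round_down/round_up are invented for the example, it assumes two's-complement longs, and it only demonstrates the rounding direction of each formula on negative offsets. The concrete 544-versus-528 frame sizes in the test also depend on the target's full frame layout, not on this arithmetic alone.)

    #include <assert.h>
    #include <stdio.h>

    /* For a power-of-two n, x & ~(n-1) rounds x toward -infinity, while
       (x + (n-1)) & ~(n-1) rounds x toward +infinity; only the first
       yields an aligned result at or below a negative stack offset x. */
    static long round_down(long x, long n) { return x & ~(n - 1); }
    static long round_up(long x, long n) { return (x + (n - 1)) & ~(n - 1); }

    int main(void) {
      assert(round_down(-530, 16) == -544); /* aligned, at or below x */
      assert(round_up(-530, 16) == -528);   /* aligned, but above x */
      assert(round_down(-528, 16) == -528); /* already aligned: unchanged */
      assert(round_up(-528, 16) == -528);
      printf("ok\n");
      return 0;
    }

Any C compiler will do; the asserts pass as written.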
diff --git a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll index 9fcc0f5d617b0..5c3800e970930 100644 --- a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll +++ b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll @@ -95,15 +95,17 @@ if.end: } ; CHECK-LABEL: diamond2: -; CHECK-BP: itte -; CHECK-BP: streq -; CHECK-BP: ldreq -; CHECK-BP: strne -; CHECK-NOBP: cbz -; CHECK-NOBP: str -; CHECK-NOBP: b -; CHECK-NOBP: str -; CHECK-NOBP: ldr +; CHECK-BP: cbz +; CHECK-BP: str +; CHECK-BP: str +; CHECK-BP: b +; CHECK-BP: str +; CHECK-BP: ldr +; CHECK-NOBP: ittee +; CHECK-NOBP: streq +; CHECK-NOBP: ldreq +; CHECK-NOBP: strne +; CHECK-NOBP: strne define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) { entry: %tobool = icmp eq i32 %n, 0 @@ -111,6 +113,8 @@ entry: if.then: store i32 %n, i32* %p, align 4 + %arrayidx = getelementptr inbounds i32, i32* %p, i32 2 + store i32 %n, i32* %arrayidx, align 4 br label %if.end if.else: diff --git a/test/CodeGen/WebAssembly/umulo-i64.ll b/test/CodeGen/WebAssembly/umulo-i64.ll new file mode 100644 index 0000000000000..e47c8aa0bb3a9 --- /dev/null +++ b/test/CodeGen/WebAssembly/umulo-i64.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -asm-verbose=false | FileCheck %s +; Test that UMULO works correctly on 64-bit operands. +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-emscripten" + +; CHECK-LABEL: _ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE: +; CHECK: __multi3 +; Function Attrs: inlinehint +define void @"_ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE"(i64, i64) unnamed_addr #0 { +start: + %2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %0, i64 %1) + %3 = extractvalue { i64, i1 } %2, 0 + store i64 %3, i64* undef + unreachable +} + +; Function Attrs: nounwind readnone speculatable +declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #1 + +attributes #0 = { inlinehint } +attributes #1 = { nounwind readnone speculatable } diff --git a/test/CodeGen/X86/2012-08-16-setcc.ll b/test/CodeGen/X86/2012-08-16-setcc.ll index c03b923cadba2..cba208e62a147 100644 --- a/test/CodeGen/X86/2012-08-16-setcc.ll +++ b/test/CodeGen/X86/2012-08-16-setcc.ll @@ -1,45 +1,53 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s ; rdar://12081007 -; CHECK-LABEL: and_1: -; CHECK: andb -; CHECK-NEXT: cmovnel -; CHECK: ret define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) { +; CHECK-LABEL: and_1: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: andb %dil, %sil +; CHECK-NEXT: cmovnel %edx, %eax +; CHECK-NEXT: retq %1 = and i8 %b, %a %2 = icmp ne i8 %1, 0 %3 = select i1 %2, i32 %x, i32 0 ret i32 %3 } -; CHECK-LABEL: and_2: -; CHECK: andb -; CHECK-NEXT: setne -; CHECK: ret define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: and_2: +; CHECK: # BB#0: +; CHECK-NEXT: andb %dil, %sil +; CHECK-NEXT: setne %al +; CHECK-NEXT: retq %1 = and i8 %b, %a %2 = icmp ne i8 %1, 0 ret i1 %2 } -; CHECK-LABEL: xor_1: -; CHECK: xorb -; CHECK-NEXT: cmovnel -; CHECK: ret define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) { +; CHECK-LABEL: xor_1: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorb %dil, %sil +; CHECK-NEXT: cmovnel %edx, %eax +; CHECK-NEXT: retq %1 = xor i8 %b, %a %2 = icmp ne i8 %1, 0 %3 = select i1 %2, i32 %x, i32 0 ret i32 %3 } -; CHECK-LABEL: 
xor_2: -; CHECK: xorb -; CHECK-NEXT: setne -; CHECK: ret define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: xor_2: +; CHECK: # BB#0: +; CHECK-NEXT: xorb %dil, %sil +; CHECK-NEXT: setne %al +; CHECK-NEXT: retq %1 = xor i8 %b, %a %2 = icmp ne i8 %1, 0 ret i1 %2 } + diff --git a/test/CodeGen/X86/GC/badreadproto.ll b/test/CodeGen/X86/GC/badreadproto.ll index 37672f8043574..aad79d75218a5 100644 --- a/test/CodeGen/X86/GC/badreadproto.ll +++ b/test/CodeGen/X86/GC/badreadproto.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 %list = type { i32, %list* } diff --git a/test/CodeGen/X86/GC/badrootproto.ll b/test/CodeGen/X86/GC/badrootproto.ll index ff86d03c646a2..37a3451c2c17e 100644 --- a/test/CodeGen/X86/GC/badrootproto.ll +++ b/test/CodeGen/X86/GC/badrootproto.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 %list = type { i32, %list* } %meta = type opaque diff --git a/test/CodeGen/X86/GC/badwriteproto.ll b/test/CodeGen/X86/GC/badwriteproto.ll index 2544e40f81ff6..62c157477635a 100644 --- a/test/CodeGen/X86/GC/badwriteproto.ll +++ b/test/CodeGen/X86/GC/badwriteproto.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 %list = type { i32, %list* } diff --git a/test/CodeGen/X86/GC/fat.ll b/test/CodeGen/X86/GC/fat.ll index d05ca3da8195a..316a80343e2fb 100644 --- a/test/CodeGen/X86/GC/fat.ll +++ b/test/CodeGen/X86/GC/fat.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 declare void @llvm.gcroot(i8**, i8*) nounwind diff --git a/test/CodeGen/X86/GC/outside.ll b/test/CodeGen/X86/GC/outside.ll index 2968c6917ce14..55eda54537898 100644 --- a/test/CodeGen/X86/GC/outside.ll +++ b/test/CodeGen/X86/GC/outside.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 declare void @llvm.gcroot(i8**, i8*) diff --git a/test/CodeGen/X86/GlobalISel/GV.ll b/test/CodeGen/X86/GlobalISel/GV.ll new file mode 100644 index 0000000000000..44862ab5a96ea --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/GV.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64 +; RUN: llc -mtriple=x86_64-apple-darwin -global-isel -verify-machineinstrs -relocation-model=pic < %s -o - | FileCheck %s --check-prefix=X64_DARWIN_PIC +; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32 +; RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32ABI + +@g_int = global i32 0, align 4 + +; Function Attrs: noinline nounwind optnone uwtable +define i32* @test_global_ptrv() #3 { +; X64-LABEL: test_global_ptrv: +; X64: # BB#0: # %entry +; X64-NEXT: leaq g_int, %rax +; X64-NEXT: retq +; +; X64_DARWIN_PIC-LABEL: test_global_ptrv: +; X64_DARWIN_PIC: ## BB#0: ## %entry +; X64_DARWIN_PIC-NEXT: leaq _g_int(%rip), %rax +; X64_DARWIN_PIC-NEXT: retq +; +; X32-LABEL: test_global_ptrv: +; X32: # BB#0: # %entry +; X32-NEXT: leal g_int, %eax +; X32-NEXT: retl +; +; X32ABI-LABEL: test_global_ptrv: +; X32ABI: # BB#0: # %entry +; X32ABI-NEXT: leal g_int, %eax +; X32ABI-NEXT: retq +entry: + ret i32* @g_int +} + +; Function Attrs: noinline nounwind optnone uwtable +define i32 @test_global_valv() #3 { +; X64-LABEL: test_global_valv: +; X64: # 
BB#0: # %entry +; X64-NEXT: leaq g_int, %rax +; X64-NEXT: movl (%rax), %eax +; X64-NEXT: retq +; +; X64_DARWIN_PIC-LABEL: test_global_valv: +; X64_DARWIN_PIC: ## BB#0: ## %entry +; X64_DARWIN_PIC-NEXT: leaq _g_int(%rip), %rax +; X64_DARWIN_PIC-NEXT: movl (%rax), %eax +; X64_DARWIN_PIC-NEXT: retq +; +; X32-LABEL: test_global_valv: +; X32: # BB#0: # %entry +; X32-NEXT: leal g_int, %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: retl +; +; X32ABI-LABEL: test_global_valv: +; X32ABI: # BB#0: # %entry +; X32ABI-NEXT: leal g_int, %eax +; X32ABI-NEXT: movl (%eax), %eax +; X32ABI-NEXT: retq +entry: + %0 = load i32, i32* @g_int, align 4 + ret i32 %0 +} + diff --git a/test/CodeGen/X86/GlobalISel/add-vec.ll b/test/CodeGen/X86/GlobalISel/add-vec.ll index 679a49d733a2f..0ea1cf820c0fa 100644 --- a/test/CodeGen/X86/GlobalISel/add-vec.ll +++ b/test/CodeGen/X86/GlobalISel/add-vec.ll @@ -1,38 +1,41 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=core-avx2 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=corei7-avx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 + define <16 x i8> @test_add_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) { -; SKX-LABEL: test_add_v16i8: -; SKX: # BB#0: -; SKX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; SKX-NEXT: retq +; ALL-LABEL: test_add_v16i8: +; ALL: # BB#0: +; ALL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; ALL-NEXT: retq %ret = add <16 x i8> %arg1, %arg2 ret <16 x i8> %ret } define <8 x i16> @test_add_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) { -; SKX-LABEL: test_add_v8i16: -; SKX: # BB#0: -; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; SKX-NEXT: retq +; ALL-LABEL: test_add_v8i16: +; ALL: # BB#0: +; ALL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; ALL-NEXT: retq %ret = add <8 x i16> %arg1, %arg2 ret <8 x i16> %ret } define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) { -; SKX-LABEL: test_add_v4i32: -; SKX: # BB#0: -; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; SKX-NEXT: retq +; ALL-LABEL: test_add_v4i32: +; ALL: # BB#0: +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: retq %ret = add <4 x i32> %arg1, %arg2 ret <4 x i32> %ret } define <2 x i64> @test_add_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) { -; SKX-LABEL: test_add_v2i64: -; SKX: # BB#0: -; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: retq +; ALL-LABEL: test_add_v2i64: +; ALL: # BB#0: +; ALL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; ALL-NEXT: retq %ret = add <2 x i64> %arg1, %arg2 ret <2 x i64> %ret } @@ -42,6 +45,20 @@ define <32 x i8> @test_add_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) { ; SKX: # BB#0: ; SKX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq +; +; AVX2-LABEL: test_add_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX1-LABEL: test_add_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq %ret = add <32 x i8> %arg1, %arg2 ret <32 x i8> %ret } @@ -51,6 +68,20 @@ define <16 x i16> 
@test_add_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) { ; SKX: # BB#0: ; SKX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq +; +; AVX2-LABEL: test_add_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX1-LABEL: test_add_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq %ret = add <16 x i16> %arg1, %arg2 ret <16 x i16> %ret } @@ -60,6 +91,20 @@ define <8 x i32> @test_add_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) { ; SKX: # BB#0: ; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq +; +; AVX2-LABEL: test_add_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX1-LABEL: test_add_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq %ret = add <8 x i32> %arg1, %arg2 ret <8 x i32> %ret } @@ -69,6 +114,20 @@ define <4 x i64> @test_add_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) { ; SKX: # BB#0: ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq +; +; AVX2-LABEL: test_add_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX1-LABEL: test_add_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq %ret = add <4 x i64> %arg1, %arg2 ret <4 x i64> %ret } @@ -78,6 +137,26 @@ define <64 x i8> @test_add_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) { ; SKX: # BB#0: ; SKX-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq +; +; AVX2-LABEL: test_add_v64i8: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX1-LABEL: test_add_v64i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpaddb %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 +; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: retq %ret = add <64 x i8> %arg1, %arg2 ret <64 x i8> %ret } @@ -87,6 +166,26 @@ define <32 x i16> @test_add_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) { ; SKX: # BB#0: ; SKX-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq +; +; AVX2-LABEL: test_add_v32i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX1-LABEL: test_add_v32i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpaddw %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 +; AVX1-NEXT: vpaddw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: retq %ret = add <32 x i16> %arg1, 
%arg2 ret <32 x i16> %ret } @@ -96,6 +195,26 @@ define <16 x i32> @test_add_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) { ; SKX: # BB#0: ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq +; +; AVX2-LABEL: test_add_v16i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX1-LABEL: test_add_v16i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 +; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: retq %ret = add <16 x i32> %arg1, %arg2 ret <16 x i32> %ret } @@ -105,6 +224,26 @@ define <8 x i64> @test_add_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) { ; SKX: # BB#0: ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq +; +; AVX2-LABEL: test_add_v8i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX1-LABEL: test_add_v8i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 +; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-NEXT: retq %ret = add <8 x i64> %arg1, %arg2 ret <8 x i64> %ret } diff --git a/test/CodeGen/X86/GlobalISel/constant.ll b/test/CodeGen/X86/GlobalISel/constant.ll index b550bb0bc7be6..5b512f9ce9377 100644 --- a/test/CodeGen/X86/GlobalISel/constant.ll +++ b/test/CodeGen/X86/GlobalISel/constant.ll @@ -51,4 +51,13 @@ define i64 @const_i64_i32() { ret i64 -1 } +define void @main(i32 ** %data) { +; ALL-LABEL: main: +; ALL: # BB#0: +; ALL-NEXT: movq $0, %rax +; ALL-NEXT: movq %rax, (%rdi) +; ALL-NEXT: retq + store i32* null, i32** %data, align 8 + ret void +} diff --git a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll index b08ac062fb4bb..11b03bd561103 100644 --- a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll +++ b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64 -; TODO merge with ext.ll after i64 sext suported on 32bit platform +; TODO merge with ext.ll after i64 sext supported on 32bit platform define i64 @test_zext_i1(i8 %a) { ; X64-LABEL: test_zext_i1: diff --git a/test/CodeGen/X86/GlobalISel/ext.ll b/test/CodeGen/X86/GlobalISel/ext.ll index 392c973c12084..d9a09678cf4bb 100644 --- a/test/CodeGen/X86/GlobalISel/ext.ll +++ b/test/CodeGen/X86/GlobalISel/ext.ll @@ -2,6 +2,42 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64 ; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32 +define i8 @test_zext_i1toi8(i32 %a) { +; X64-LABEL: test_zext_i1toi8: +; X64: # BB#0: +; X64-NEXT: andb $1, %dil +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq +; +; 
X32-LABEL: test_zext_i1toi8: +; X32: # BB#0: +; X32-NEXT: movl 4(%esp), %eax +; X32-NEXT: andb $1, %al +; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; X32-NEXT: retl + %val = trunc i32 %a to i1 + %r = zext i1 %val to i8 + ret i8 %r +} + +define i16 @test_zext_i1toi16(i32 %a) { +; X64-LABEL: test_zext_i1toi16: +; X64: # BB#0: +; X64-NEXT: andw $1, %di +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_zext_i1toi16: +; X32: # BB#0: +; X32-NEXT: movl 4(%esp), %eax +; X32-NEXT: andw $1, %ax +; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; X32-NEXT: retl + %val = trunc i32 %a to i1 + %r = zext i1 %val to i16 + ret i16 %r +} + define i32 @test_zext_i1(i32 %a) { ; X64-LABEL: test_zext_i1: ; X64: # BB#0: diff --git a/test/CodeGen/X86/GlobalISel/legalize-GV.mir b/test/CodeGen/X86/GlobalISel/legalize-GV.mir new file mode 100644 index 0000000000000..7f9971e4c70a4 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/legalize-GV.mir @@ -0,0 +1,31 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 +--- | + + @g_int = global i32 0, align 4 + + define i32* @test_global_ptrv() { + entry: + ret i32* @g_int + } +... +--- +name: test_global_ptrv +# ALL-LABEL: name: test_global_ptrv +alignment: 4 +legalized: false +regBankSelected: false +# ALL: registers: +# ALL-NEXT: - { id: 0, class: _, preferred-register: '' } +registers: + - { id: 0, class: _, preferred-register: '' } +# ALL: %0(p0) = G_GLOBAL_VALUE @g_int +# ALL-NEXT: %rax = COPY %0(p0) +# ALL-NEXT: RET 0, implicit %rax +body: | + bb.1.entry: + %0(p0) = G_GLOBAL_VALUE @g_int + %rax = COPY %0(p0) + RET 0, implicit %rax + +... diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir index c9add0dc4e95c..c86bfd9ee96d3 100644 --- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir +++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir @@ -1,12 +1,28 @@ # RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | - define i32 @test_zext_i1(i8 %a) { + + define i8 @test_zext_i1toi8(i1 %a) { + %r = zext i1 %a to i8 + ret i8 %r + } + + define i16 @test_zext_i1toi16(i1 %a) { + %r = zext i1 %a to i16 + ret i16 %r + } + + define i32 @test_zext_i1(i8 %a) { %val = trunc i8 %a to i1 %r = zext i1 %val to i32 ret i32 %r } + define i16 @test_zext_i8toi16(i8 %val) { + %r = zext i8 %val to i16 + ret i16 %r + } + define i32 @test_zext_i8(i8 %val) { %r = zext i8 %val to i32 ret i32 %r @@ -17,12 +33,27 @@ ret i32 %r } + define i8 @test_sext_i1toi8(i1 %a) { + %r = sext i1 %a to i8 + ret i8 %r + } + + define i16 @test_sext_i1toi16(i1 %a) { + %r = sext i1 %a to i16 + ret i16 %r + } + define i32 @test_sext_i1(i8 %a) { %val = trunc i8 %a to i1 %r = sext i1 %val to i32 ret i32 %r } + define i16 @test_sext_i8toi16(i8 %val) { + %r = sext i8 %val to i16 + ret i16 %r + } + define i32 @test_sext_i8(i8 %val) { %r = sext i8 %val to i32 ret i32 %r @@ -35,6 +66,52 @@ ... 
--- +name: test_zext_i1toi8 +# ALL-LABEL: name: test_zext_i1toi8 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _, preferred-register: '' } + - { id: 1, class: _, preferred-register: '' } +# ALL: %0(s1) = COPY %edi +# ALL-NEXT: %1(s8) = G_ZEXT %0(s1) +# ALL-NEXT: %al = COPY %1(s8) +# ALL-NEXT: RET 0, implicit %al +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s1) = COPY %edi + %1(s8) = G_ZEXT %0(s1) + %al = COPY %1(s8) + RET 0, implicit %al + +... +--- +name: test_zext_i1toi16 +# ALL-LABEL: name: test_zext_i1toi16 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _, preferred-register: '' } + - { id: 1, class: _, preferred-register: '' } +# ALL: %0(s1) = COPY %edi +# ALL-NEXT: %1(s16) = G_ZEXT %0(s1) +# ALL-NEXT: %ax = COPY %1(s16) +# ALL-NEXT: RET 0, implicit %ax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s1) = COPY %edi + %1(s16) = G_ZEXT %0(s1) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... +--- name: test_zext_i1 # ALL-LABEL: name: test_zext_i1 alignment: 4 @@ -61,6 +138,29 @@ body: | ... --- +name: test_zext_i8toi16 +# ALL-LABEL: name: test_zext_i8toi16 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _, preferred-register: '' } + - { id: 1, class: _, preferred-register: '' } +# ALL: %0(s8) = COPY %edi +# ALL-NEXT: %1(s16) = G_ZEXT %0(s8) +# ALL-NEXT: %ax = COPY %1(s16) +# ALL-NEXT: RET 0, implicit %ax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s16) = G_ZEXT %0(s8) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... +--- name: test_zext_i8 # ALL-LABEL: name: test_zext_i8 alignment: 4 @@ -107,6 +207,52 @@ body: | ... --- +name: test_sext_i1toi8 +# ALL-LABEL: name: test_sext_i1toi8 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _, preferred-register: '' } + - { id: 1, class: _, preferred-register: '' } +# ALL: %0(s1) = COPY %edi +# ALL-NEXT: %1(s8) = G_SEXT %0(s1) +# ALL-NEXT: %al = COPY %1(s8) +# ALL-NEXT: RET 0, implicit %al +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s1) = COPY %edi + %1(s8) = G_SEXT %0(s1) + %al = COPY %1(s8) + RET 0, implicit %al + +... +--- +name: test_sext_i1toi16 +# ALL-LABEL: name: test_sext_i1toi16 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _, preferred-register: '' } + - { id: 1, class: _, preferred-register: '' } +# ALL: %0(s1) = COPY %edi +# ALL-NEXT: %1(s16) = G_SEXT %0(s1) +# ALL-NEXT: %ax = COPY %1(s16) +# ALL-NEXT: RET 0, implicit %ax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s1) = COPY %edi + %1(s16) = G_SEXT %0(s1) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... +--- name: test_sext_i1 # ALL-LABEL: name: test_sext_i1 alignment: 4 @@ -133,6 +279,29 @@ body: | ... --- +name: test_sext_i8toi16 +# ALL-LABEL: name: test_sext_i8toi16 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _, preferred-register: '' } + - { id: 1, class: _, preferred-register: '' } +# ALL: %0(s8) = COPY %edi +# ALL-NEXT: %1(s16) = G_SEXT %0(s8) +# ALL-NEXT: %ax = COPY %1(s16) +# ALL-NEXT: RET 0, implicit %ax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s16) = G_SEXT %0(s8) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... 
+--- name: test_sext_i8 # ALL-LABEL: name: test_sext_i8 alignment: 4 diff --git a/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir new file mode 100644 index 0000000000000..60d9fc63c14ad --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir @@ -0,0 +1,110 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 + +--- | + define void @test_memop_s8tos32() { + ret void + } + + define void @test_memop_s64() { + ret void + } +... +--- +name: test_memop_s8tos32 +# ALL-LABEL: name: test_memop_s8tos32 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _, preferred-register: '' } + - { id: 1, class: _, preferred-register: '' } + - { id: 2, class: _, preferred-register: '' } + - { id: 3, class: _, preferred-register: '' } + - { id: 4, class: _, preferred-register: '' } + - { id: 5, class: _, preferred-register: '' } + - { id: 6, class: _, preferred-register: '' } + - { id: 7, class: _, preferred-register: '' } + - { id: 8, class: _, preferred-register: '' } + - { id: 9, class: _, preferred-register: '' } + - { id: 10, class: _, preferred-register: '' } +# ALL: %0(p0) = IMPLICIT_DEF +# ALL-NEXT: %11(s8) = G_LOAD %0(p0) :: (load 1) +# ALL-NEXT: %9(s1) = G_TRUNC %11(s8) +# ALL-NEXT: %1(s8) = G_LOAD %0(p0) :: (load 1) +# ALL-NEXT: %2(s16) = G_LOAD %0(p0) :: (load 2) +# ALL-NEXT: %3(s32) = G_LOAD %0(p0) :: (load 4) +# ALL-NEXT: %4(p0) = G_LOAD %0(p0) :: (load 8) +# ALL-NEXT: %10(s1) = IMPLICIT_DEF +# ALL-NEXT: %12(s8) = G_ZEXT %10(s1) +# ALL-NEXT: G_STORE %12(s8), %0(p0) :: (store 1) +# ALL-NEXT: %5(s8) = IMPLICIT_DEF +# ALL-NEXT: G_STORE %5(s8), %0(p0) :: (store 1) +# ALL-NEXT: %6(s16) = IMPLICIT_DEF +# ALL-NEXT: G_STORE %6(s16), %0(p0) :: (store 2) +# ALL-NEXT: %7(s32) = IMPLICIT_DEF +# ALL-NEXT: G_STORE %7(s32), %0(p0) :: (store 4) +# ALL-NEXT: %8(p0) = IMPLICIT_DEF +# ALL-NEXT: G_STORE %8(p0), %0(p0) :: (store 8) +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = IMPLICIT_DEF + %9(s1) = G_LOAD %0(p0) :: (load 1) + %1(s8) = G_LOAD %0(p0) :: (load 1) + %2(s16) = G_LOAD %0(p0) :: (load 2) + %3(s32) = G_LOAD %0(p0) :: (load 4) + %4(p0) = G_LOAD %0(p0) :: (load 8) + + %10(s1) = IMPLICIT_DEF + G_STORE %10, %0 :: (store 1) + %5(s8) = IMPLICIT_DEF + G_STORE %5, %0 :: (store 1) + %6(s16) = IMPLICIT_DEF + G_STORE %6, %0 :: (store 2) + %7(s32) = IMPLICIT_DEF + G_STORE %7, %0 :: (store 4) + %8(p0) = IMPLICIT_DEF + G_STORE %8, %0 :: (store 8) +... 
+--- +name: test_memop_s64 +# ALL-LABEL: name: test_memop_s64 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _, preferred-register: '' } + - { id: 1, class: _, preferred-register: '' } + - { id: 2, class: _, preferred-register: '' } +liveins: +# X64: %0(p0) = IMPLICIT_DEF +# X64-NEXT: %1(s64) = G_LOAD %0(p0) :: (load 8) +# X64-NEXT: %2(s64) = IMPLICIT_DEF +# X64-NEXT: G_STORE %2(s64), %0(p0) :: (store 8) +# +# X32: %0(p0) = IMPLICIT_DEF +# X32-NEXT: %3(s32) = G_LOAD %0(p0) :: (load 8) +# X32-NEXT: %6(s32) = G_CONSTANT i32 4 +# X32-NEXT: %5(p0) = G_GEP %0, %6(s32) +# X32-NEXT: %4(s32) = G_LOAD %5(p0) :: (load 8) +# X32-NEXT: %1(s64) = G_MERGE_VALUES %3(s32), %4(s32) +# X32-NEXT: %2(s64) = IMPLICIT_DEF +# X32-NEXT: %7(s32), %8(s32) = G_UNMERGE_VALUES %2(s64) +# X32-NEXT: G_STORE %7(s32), %0(p0) :: (store 8) +# X32-NEXT: %10(s32) = G_CONSTANT i32 4 +# X32-NEXT: %9(p0) = G_GEP %0, %10(s32) +# X32-NEXT: G_STORE %8(s32), %9(p0) :: (store 8) +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = IMPLICIT_DEF + %1(s64) = G_LOAD %0(p0) :: (load 8) + + %2(s64) = IMPLICIT_DEF + G_STORE %2, %0 :: (store 8) + +... + diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll index 2757e6493258c..1c719b1bf74da 100644 --- a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll +++ b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll @@ -4,6 +4,16 @@ ;TODO merge with x86-64 tests (many operations not supported yet) +define i1 @test_load_i1(i1 * %p1) { +; ALL-LABEL: test_load_i1: +; ALL: # BB#0: +; ALL-NEXT: movl 4(%esp), %eax +; ALL-NEXT: movb (%eax), %al +; ALL-NEXT: retl + %r = load i1, i1* %p1 + ret i1 %r +} + define i8 @test_load_i8(i8 * %p1) { ; ALL-LABEL: test_load_i8: ; ALL: # BB#0: @@ -34,6 +44,18 @@ define i32 @test_load_i32(i32 * %p1) { ret i32 %r } +define i1 * @test_store_i1(i1 %val, i1 * %p1) { +; ALL-LABEL: test_store_i1: +; ALL: # BB#0: +; ALL-NEXT: movb 4(%esp), %cl +; ALL-NEXT: movl 8(%esp), %eax +; ALL-NEXT: andb $1, %cl +; ALL-NEXT: movb %cl, (%eax) +; ALL-NEXT: retl + store i1 %val, i1* %p1 + ret i1 * %p1; +} + define i8 * @test_store_i8(i8 %val, i8 * %p1) { ; ALL-LABEL: test_store_i8: ; ALL: # BB#0: diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar.ll b/test/CodeGen/X86/GlobalISel/memop-scalar.ll index 2e04b3cf20b37..2097a3b0bfc9f 100644 --- a/test/CodeGen/X86/GlobalISel/memop-scalar.ll +++ b/test/CodeGen/X86/GlobalISel/memop-scalar.ll @@ -2,6 +2,15 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_FAST ; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_GREEDY +define i1 @test_load_i1(i1 * %p1) { +; ALL-LABEL: test_load_i1: +; ALL: # BB#0: +; ALL-NEXT: movb (%rdi), %al +; ALL-NEXT: retq + %r = load i1, i1* %p1 + ret i1 %r +} + define i8 @test_load_i8(i8 * %p1) { ; ALL-LABEL: test_load_i8: ; ALL: # BB#0: @@ -70,6 +79,17 @@ define double @test_load_double(double * %p1) { ret double %r } +define i1 * @test_store_i1(i1 %val, i1 * %p1) { +; ALL-LABEL: test_store_i1: +; ALL: # BB#0: +; ALL-NEXT: andb $1, %dil +; ALL-NEXT: movb %dil, (%rsi) +; ALL-NEXT: movq %rsi, %rax +; ALL-NEXT: retq + store i1 %val, i1* %p1 + ret i1 * %p1; +} + define i32 * @test_store_i32(i32 %val, i32 * %p1) { ; ALL-LABEL: test_store_i32: ; ALL: # BB#0: diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir 
b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir index 3658bc9af957a..95ef15ceb6893 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir @@ -174,6 +174,13 @@ ret i64 %ret } + @g_int = global i32 0, align 4 + + define i32* @test_global_ptrv() { + entry: + ret i32* @g_int + } + ... --- name: test_add_i8 @@ -1084,4 +1091,24 @@ body: | RET 0, implicit %rax ... +--- +name: test_global_ptrv +# CHECK-LABEL: name: test_global_ptrv +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } +registers: + - { id: 0, class: _, preferred-register: '' } +# CHECK: %0(p0) = G_GLOBAL_VALUE @g_int +# CHECK-NEXT: %rax = COPY %0(p0) +# CHECK-NEXT: RET 0, implicit %rax +body: | + bb.1.entry: + %0(p0) = G_GLOBAL_VALUE @g_int + %rax = COPY %0(p0) + RET 0, implicit %rax + +... diff --git a/test/CodeGen/X86/GlobalISel/select-GV.mir b/test/CodeGen/X86/GlobalISel/select-GV.mir new file mode 100644 index 0000000000000..2f2fd51d99d1d --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-GV.mir @@ -0,0 +1,99 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64ALL --check-prefix=X64 +# RUN: llc -mtriple=x86_64-apple-darwin -relocation-model=pic -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64ALL --check-prefix=X64_DARWIN_PIC +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ALL --check-prefix=X32 +# RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ALL --check-prefix=X32ABI + +--- | + + @g_int = global i32 0, align 4 + + define i32* @test_global_ptrv() { + entry: + ret i32* @g_int + } + + define i32 @test_global_valv() { + entry: + %0 = load i32, i32* @g_int, align 4 + ret i32 %0 + } + +... +--- +name: test_global_ptrv +# CHECK-LABEL: name: test_global_ptrv +alignment: 4 +legalized: true +regBankSelected: true +# X64ALL: registers: +# X64ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' } +# +# X32ALL: registers: +# X32ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' } +registers: + - { id: 0, class: gpr, preferred-register: '' } +# X64: %0 = LEA64r _, 1, _, @g_int, _ +# X64-NEXT: %rax = COPY %0 +# X64-NEXT: RET 0, implicit %rax +# +# X64_DARWIN_PIC: %0 = LEA64r %rip, 1, _, @g_int, _ +# X64_DARWIN_PIC-NEXT: %rax = COPY %0 +# X64_DARWIN_PIC-NEXT: RET 0, implicit %rax +# +# X32: %0 = LEA32r _, 1, _, @g_int, _ +# X32-NEXT: %rax = COPY %0 +# X32-NEXT: RET 0, implicit %rax +# +# X32ABI: %0 = LEA64_32r _, 1, _, @g_int, _ +# X32ABI-NEXT: %rax = COPY %0 +# X32ABI-NEXT: RET 0, implicit %rax +body: | + bb.1.entry: + %0(p0) = G_GLOBAL_VALUE @g_int + %rax = COPY %0(p0) + RET 0, implicit %rax + +... 
+--- +name: test_global_valv +# CHECK-LABEL: name: test_global_valv +alignment: 4 +legalized: true +regBankSelected: true +# X64ALL: registers: +# X64ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' } +# X64ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' } +# +# X32ALL: registers: +# X32ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' } +# X32ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' } +registers: + - { id: 0, class: gpr, preferred-register: '' } + - { id: 1, class: gpr, preferred-register: '' } +# X64: %1 = LEA64r _, 1, _, @g_int, _ +# X64-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int) +# X64-NEXT: %eax = COPY %0 +# X64-NEXT: RET 0, implicit %eax +# +# X64_DARWIN_PIC: %1 = LEA64r %rip, 1, _, @g_int, _ +# X64_DARWIN_PIC-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int) +# X64_DARWIN_PIC-NEXT: %eax = COPY %0 +# X64_DARWIN_PIC-NEXT: RET 0, implicit %eax +# +# X32: %1 = LEA32r _, 1, _, @g_int, _ +# X32-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int) +# X32-NEXT: %eax = COPY %0 +# X32-NEXT: RET 0, implicit %eax +# +# X32ABI: %1 = LEA64_32r _, 1, _, @g_int, _ +# X32ABI-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int) +# X32ABI-NEXT: %eax = COPY %0 +# X32ABI-NEXT: RET 0, implicit %eax +body: | + bb.1.entry: + %1(p0) = G_GLOBAL_VALUE @g_int + %0(s32) = G_LOAD %1(p0) :: (load 4 from @g_int) + %eax = COPY %0(s32) + RET 0, implicit %eax + +... diff --git a/test/CodeGen/X86/GlobalISel/select-constant.mir b/test/CodeGen/X86/GlobalISel/select-constant.mir index 4b91b5f9f0982..30f57418b4ce0 100644 --- a/test/CodeGen/X86/GlobalISel/select-constant.mir +++ b/test/CodeGen/X86/GlobalISel/select-constant.mir @@ -29,6 +29,11 @@ ret i64 -1 } + define void @main(i32** %data) { + store i32* null, i32** %data, align 8 + ret void + } + ... --- name: const_i8 @@ -162,3 +167,29 @@ body: | RET 0, implicit %rax ... +--- +name: main +# CHECK-LABEL: name: main +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' } +# CHECK-NEXT: - { id: 1, class: gr64, preferred-register: '' } +registers: + - { id: 0, class: gpr, preferred-register: '' } + - { id: 1, class: gpr, preferred-register: '' } +# CHECK: %0 = COPY %rdi +# CHECK-NEXT: %1 = MOV64ri32 0 +# CHECK-NEXT: MOV64mr %0, 1, _, 0, _, %1 :: (store 8 into %ir.data) +# CHECK-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(p0) = G_CONSTANT i64 0 + G_STORE %1(p0), %0(p0) :: (store 8 into %ir.data) + RET 0 + +... diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir index b52f1f6fa621e..b6734e5aa2b83 100644 --- a/test/CodeGen/X86/GlobalISel/select-ext.mir +++ b/test/CodeGen/X86/GlobalISel/select-ext.mir @@ -2,6 +2,16 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i8 @test_zext_i1toi8(i1 %a) { + %r = zext i1 %a to i8 + ret i8 %r + } + + define i16 @test_zext_i1toi16(i1 %a) { + %r = zext i1 %a to i16 + ret i16 %r + } + define i32 @test_zext_i1(i1 %a) { %r = zext i1 %a to i32 ret i32 %r @@ -29,6 +39,60 @@ ... 
--- +name: test_zext_i1toi8 +# ALL-LABEL: name: test_zext_i1toi8 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' } +# ALL-NEXT: - { id: 1, class: gr8, preferred-register: '' } +registers: + - { id: 0, class: gpr, preferred-register: '' } + - { id: 1, class: gpr, preferred-register: '' } +# ALL: %0 = COPY %dil +# ALL-NEXT: %1 = AND8ri %0, 1, implicit-def %eflags +# ALL-NEXT: %al = COPY %1 +# ALL-NEXT: RET 0, implicit %al +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s1) = COPY %edi + %1(s8) = G_ZEXT %0(s1) + %al = COPY %1(s8) + RET 0, implicit %al + +... +--- +name: test_zext_i1toi16 +# ALL-LABEL: name: test_zext_i1toi16 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' } +# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' } +# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' } +registers: + - { id: 0, class: gpr, preferred-register: '' } + - { id: 1, class: gpr, preferred-register: '' } +# ALL: %0 = COPY %dil +# ALL-NEXT: %2 = SUBREG_TO_REG 0, %0, 1 +# ALL-NEXT: %1 = AND16ri8 %2, 1, implicit-def %eflags +# ALL-NEXT: %ax = COPY %1 +# ALL-NEXT: RET 0, implicit %ax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s1) = COPY %edi + %1(s16) = G_ZEXT %0(s1) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... +--- name: test_zext_i1 # ALL-LABEL: name: test_zext_i1 alignment: 4 diff --git a/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir b/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir new file mode 100644 index 0000000000000..09dc5344796f9 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir @@ -0,0 +1,53 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512VL +--- | + define void @test_unmerge() { + ret void + } + +... +--- +name: test_unmerge +# AVX-LABEL: name: test_unmerge +# +# AVX512VL-LABEL: name: test_unmerge +alignment: 4 +legalized: true +regBankSelected: true +# AVX: registers: +# AVX-NEXT: - { id: 0, class: vr256, preferred-register: '' } +# AVX-NEXT: - { id: 1, class: vr128, preferred-register: '' } +# AVX-NEXT: - { id: 2, class: vr128, preferred-register: '' } +# +# AVX512VL: registers: +# AVX512VL-NEXT: - { id: 0, class: vr256x, preferred-register: '' } +# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' } +# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# AVX: %0 = IMPLICIT_DEF +# AVX-NEXT: %1 = COPY %0.sub_xmm +# AVX-NEXT: %2 = VEXTRACTF128rr %0, 1 +# AVX-NEXT: %xmm0 = COPY %1 +# AVX-NEXT: %xmm1 = COPY %2 +# AVX-NEXT: RET 0, implicit %xmm0, implicit %xmm1 +# +# AVX512VL: %0 = IMPLICIT_DEF +# AVX512VL-NEXT: %1 = COPY %0.sub_xmm +# AVX512VL-NEXT: %2 = VEXTRACTF32x4Z256rr %0, 1 +# AVX512VL-NEXT: %xmm0 = COPY %1 +# AVX512VL-NEXT: %xmm1 = COPY %2 +# AVX512VL-NEXT: RET 0, implicit %xmm0, implicit %xmm1 +body: | + bb.1 (%ir-block.0): + + %0(<8 x s32>) = IMPLICIT_DEF + %1(<4 x s32>), %2(<4 x s32>) = G_UNMERGE_VALUES %0(<8 x s32>) + %xmm0 = COPY %1(<4 x s32>) + %xmm1 = COPY %2(<4 x s32>) + RET 0, implicit %xmm0, implicit %xmm1 + +... 
+ diff --git a/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir b/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir new file mode 100644 index 0000000000000..a63733d07f6a6 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir @@ -0,0 +1,74 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL +--- | + define void @test_unmerge_v128() { + ret void + } + + define void @test_unmerge_v256() { + ret void + } + +... +--- +name: test_unmerge_v128 +# ALL-LABEL: name: test_unmerge_v128 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' } +# ALL-NEXT: - { id: 1, class: vr128x, preferred-register: '' } +# ALL-NEXT: - { id: 2, class: vr128x, preferred-register: '' } +# ALL-NEXT: - { id: 3, class: vr128x, preferred-register: '' } +# ALL-NEXT: - { id: 4, class: vr128x, preferred-register: '' } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } + - { id: 3, class: vecr } + - { id: 4, class: vecr } +# ALL: %0 = IMPLICIT_DEF +# ALL-NEXT: %1 = COPY %0.sub_xmm +# ALL-NEXT: %2 = VEXTRACTF32x4Zrr %0, 1 +# ALL-NEXT: %3 = VEXTRACTF32x4Zrr %0, 2 +# ALL-NEXT: %4 = VEXTRACTF32x4Zrr %0, 3 +# ALL-NEXT: %xmm0 = COPY %1 +# ALL-NEXT: RET 0, implicit %xmm0 +body: | + bb.1 (%ir-block.0): + + %0(<16 x s32>) = IMPLICIT_DEF + %1(<4 x s32>), %2(<4 x s32>), %3(<4 x s32>), %4(<4 x s32>) = G_UNMERGE_VALUES %0(<16 x s32>) + %xmm0 = COPY %1(<4 x s32>) + RET 0, implicit %xmm0 + +... +--- +name: test_unmerge_v256 +# ALL-LABEL: name: test_unmerge_v256 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' } +# ALL-NEXT: - { id: 1, class: vr256x, preferred-register: '' } +# ALL-NEXT: - { id: 2, class: vr256x, preferred-register: '' } +registers: + - { id: 0, class: vecr } + - { id: 1, class: vecr } + - { id: 2, class: vecr } +# ALL: %0 = IMPLICIT_DEF +# ALL-NEXT: %1 = COPY %0.sub_ymm +# ALL-NEXT: %2 = VEXTRACTF64x4Zrr %0, 1 +# ALL-NEXT: %xmm0 = COPY %1 +# ALL-NEXT: RET 0, implicit %ymm0 +body: | + bb.1 (%ir-block.0): + + %0(<16 x s32>) = IMPLICIT_DEF + %1(<8 x s32>), %2(<8 x s32>) = G_UNMERGE_VALUES %0(<16 x s32>) + %xmm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... + diff --git a/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll b/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll new file mode 100644 index 0000000000000..2743f882b2e41 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o %t.out 2> %t.err +; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-OUT < %t.out +; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-ERR < %t.err +; This file checks that the fallback path to selection dag works. +; The test is fragile in the sense that it must be updated to expose +; something that fails with global-isel. +; When we cannot produce a test case anymore, that means we can remove +; the fallback path. + +; Check that we fall back on x86_fp80 legalization failures. 
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(s80) = G_FCONSTANT x86_fp80 0xK4002A000000000000000 +; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_x86_fp80_dump +; FALLBACK-WITH-REPORT-OUT-LABEL: test_x86_fp80_dump: +define void @test_x86_fp80_dump(x86_fp80* %ptr){ + store x86_fp80 0xK4002A000000000000000, x86_fp80* %ptr, align 16 + ret void +} + diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index e5f7cc5c6dd8d..640b5215afe9f 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -2624,7 +2624,8 @@ define void @avg_v64i8_const(<64 x i8>* %a) { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 @@ -2941,7 +2942,8 @@ define void @avg_v32i16_const(<32 x i16>* %a) { ; AVX512F: # BB#0: ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/avx-cmp.ll b/test/CodeGen/X86/avx-cmp.ll index a050d6abe56f9..963878b0f5632 100644 --- a/test/CodeGen/X86/avx-cmp.ll +++ b/test/CodeGen/X86/avx-cmp.ll @@ -1,25 +1,59 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: vcmpltps %ymm -; CHECK-NOT: vucomiss -define <8 x i32> @cmp00(<8 x float> %a, <8 x float> %b) nounwind readnone { +; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s + +define <8 x i32> @cmp00(<8 x float> %a, <8 x float> %b) nounwind { +; CHECK-LABEL: cmp00: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = fcmp olt <8 x float> %a, %b %s = sext <8 x i1> %bincmp to <8 x i32> ret <8 x i32> %s } -; CHECK: vcmpltpd %ymm -; CHECK-NOT: vucomisd -define <4 x i64> @cmp01(<4 x double> %a, <4 x double> %b) nounwind readnone { +define <4 x i64> @cmp01(<4 x double> %a, <4 x double> %b) nounwind { +; CHECK-LABEL: cmp01: +; CHECK: # BB#0: +; CHECK-NEXT: vcmpltpd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = fcmp olt <4 x double> %a, %b %s = sext <4 x i1> %bincmp to <4 x i64> ret <4 x i64> %s } -declare void @scale() nounwind uwtable - -; CHECK: vucomisd -define void @render() nounwind uwtable { +declare void @scale() nounwind + +define void @render() nounwind { +; CHECK-LABEL: render: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: jne .LBB2_6 +; CHECK-NEXT: # BB#1: # %for.cond5.preheader +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: jmp .LBB2_2 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB2_5: # %if.then +; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 +; CHECK-NEXT: callq scale +; CHECK-NEXT: .LBB2_2: # %for.cond5 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: testb %bl, %bl +; CHECK-NEXT: jne .LBB2_2 +; CHECK-NEXT: # BB#3: # %for.cond5 +; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 +; CHECK-NEXT: testb %bl, %bl +; CHECK-NEXT: je .LBB2_2 +; CHECK-NEXT: # BB#4: # %for.body33 +; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 +; CHECK-NEXT: vucomisd {{\.LCPI.*}}, %xmm0 +; CHECK-NEXT: jne .LBB2_5 +; CHECK-NEXT: jp .LBB2_5 +; CHECK-NEXT: jmp .LBB2_2 +; CHECK-NEXT: .LBB2_6: # %for.end52 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq entry: br i1 undef, label %for.cond5, label %for.end52 @@ -42,89 +76,113 @@ for.end52: ret void } -; CHECK: vextractf128 $1 -; CHECK: vextractf128 $1 -; CHECK-NEXT: vpcmpgtd %xmm -; CHECK-NEXT: vpcmpgtd %xmm -; CHECK-NEXT: vinsertf128 $1 -define <8 x i32> @int256-cmp(<8 x i32> %i, <8 x i32> %j) nounwind readnone { +define <8 x i32> @int256_cmp(<8 x i32> %i, <8 x i32> %j) nounwind { +; CHECK-LABEL: int256_cmp: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3 +; CHECK-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = icmp slt <8 x i32> %i, %j %x = sext <8 x i1> %bincmp to <8 x i32> ret <8 x i32> %x } -; CHECK: vextractf128 $1 -; CHECK: vextractf128 $1 -; CHECK-NEXT: vpcmpgtq %xmm -; CHECK-NEXT: vpcmpgtq %xmm -; CHECK-NEXT: vinsertf128 $1 -define <4 x i64> @v4i64-cmp(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +define <4 x i64> @v4i64_cmp(<4 x i64> %i, <4 x i64> %j) nounwind { +; CHECK-LABEL: v4i64_cmp: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3 +; CHECK-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = icmp slt <4 x i64> %i, %j %x = sext <4 x i1> %bincmp to <4 x i64> ret <4 x i64> %x } -; CHECK: vextractf128 $1 -; CHECK: vextractf128 $1 -; CHECK-NEXT: vpcmpgtw %xmm -; CHECK-NEXT: vpcmpgtw %xmm -; CHECK-NEXT: vinsertf128 $1 -define <16 x i16> 
@v16i16-cmp(<16 x i16> %i, <16 x i16> %j) nounwind readnone { +define <16 x i16> @v16i16_cmp(<16 x i16> %i, <16 x i16> %j) nounwind { +; CHECK-LABEL: v16i16_cmp: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3 +; CHECK-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = icmp slt <16 x i16> %i, %j %x = sext <16 x i1> %bincmp to <16 x i16> ret <16 x i16> %x } -; CHECK: vextractf128 $1 -; CHECK: vextractf128 $1 -; CHECK-NEXT: vpcmpgtb %xmm -; CHECK-NEXT: vpcmpgtb %xmm -; CHECK-NEXT: vinsertf128 $1 -define <32 x i8> @v32i8-cmp(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +define <32 x i8> @v32i8_cmp(<32 x i8> %i, <32 x i8> %j) nounwind { +; CHECK-LABEL: v32i8_cmp: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3 +; CHECK-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = icmp slt <32 x i8> %i, %j %x = sext <32 x i1> %bincmp to <32 x i8> ret <32 x i8> %x } -; CHECK: vextractf128 $1 -; CHECK: vextractf128 $1 -; CHECK-NEXT: vpcmpeqd %xmm -; CHECK-NEXT: vpcmpeqd %xmm -; CHECK-NEXT: vinsertf128 $1 -define <8 x i32> @int256-cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind readnone { +define <8 x i32> @int256_cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind { +; CHECK-LABEL: int256_cmpeq: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = icmp eq <8 x i32> %i, %j %x = sext <8 x i1> %bincmp to <8 x i32> ret <8 x i32> %x } -; CHECK: vextractf128 $1 -; CHECK: vextractf128 $1 -; CHECK-NEXT: vpcmpeqq %xmm -; CHECK-NEXT: vpcmpeqq %xmm -; CHECK-NEXT: vinsertf128 $1 -define <4 x i64> @v4i64-cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind readnone { +define <4 x i64> @v4i64_cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind { +; CHECK-LABEL: v4i64_cmpeq: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = icmp eq <4 x i64> %i, %j %x = sext <4 x i1> %bincmp to <4 x i64> ret <4 x i64> %x } -; CHECK: vextractf128 $1 -; CHECK: vextractf128 $1 -; CHECK-NEXT: vpcmpeqw %xmm -; CHECK-NEXT: vpcmpeqw %xmm -; CHECK-NEXT: vinsertf128 $1 -define <16 x i16> @v16i16-cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind readnone { +define <16 x i16> @v16i16_cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind { +; CHECK-LABEL: v16i16_cmpeq: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = icmp eq <16 x i16> %i, %j %x = sext <16 x i1> %bincmp to <16 x i16> ret <16 x i16> %x } -; CHECK: vextractf128 $1 -; CHECK: vextractf128 $1 -; CHECK-NEXT: vpcmpeqb %xmm -; CHECK-NEXT: vpcmpeqb %xmm -; CHECK-NEXT: vinsertf128 $1 -define <32 x i8> @v32i8-cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone { +define <32 x i8> @v32i8_cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind { +; 
CHECK-LABEL: v32i8_cmpeq: +; CHECK: # BB#0: +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq %bincmp = icmp eq <32 x i8> %i, %j %x = sext <32 x i1> %bincmp to <32 x i8> ret <32 x i8> %x @@ -132,17 +190,28 @@ define <32 x i8> @v32i8-cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ;; Scalar comparison -; CHECK: scalarcmpA -; CHECK: vcmpeqsd define i32 @scalarcmpA() uwtable ssp { +; CHECK-LABEL: scalarcmpA: +; CHECK: # BB#0: +; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqsd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovq %xmm0, %rax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill> +; CHECK-NEXT: retq %cmp29 = fcmp oeq double undef, 0.000000e+00 %res = zext i1 %cmp29 to i32 ret i32 %res } -; CHECK: scalarcmpB -; CHECK: vcmpeqss define i32 @scalarcmpB() uwtable ssp { +; CHECK-LABEL: scalarcmpB: +; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: retq %cmp29 = fcmp oeq float undef, 0.000000e+00 %res = zext i1 %cmp29 to i32 ret i32 %res diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll index d7eceb7cce664..06aadc476e4ca 100644 --- a/test/CodeGen/X86/avx-load-store.ll +++ b/test/CodeGen/X86/avx-load-store.ll @@ -1,13 +1,62 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -; RUN: llc -O0 < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -check-prefix=CHECK_O0 - -; CHECK: vmovaps -; CHECK: vmovaps -; CHECK: vmovaps -; CHECK: vmovaps -; CHECK: vmovaps -; CHECK: vmovaps -define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind uwtable ssp { +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,slow-unaligned-mem-32 | FileCheck %s +; RUN: llc -O0 < %s -mtriple=x86_64-unknown-unknown -mattr=avx,slow-unaligned-mem-32 | FileCheck %s -check-prefix=CHECK_O0 + +define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind { +; CHECK-LABEL: test_256_load: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $96, %rsp +; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movq %rsi, %r15 +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: vmovaps (%rbx), %ymm0 +; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill +; CHECK-NEXT: vmovaps (%r15), %ymm1 +; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill +; CHECK-NEXT: vmovaps (%r14), %ymm2 +; CHECK-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; CHECK-NEXT: callq dummy +; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %ymm0, (%rbx) +; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %ymm0, (%r15) +; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %ymm0, (%r14) +; CHECK-NEXT: addq $96, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +; +; CHECK_O0-LABEL: test_256_load: +; CHECK_O0: # BB#0: # %entry +; CHECK_O0-NEXT: subq $152, %rsp +; CHECK_O0-NEXT: vmovapd (%rdi), 
%ymm0
+; CHECK_O0-NEXT: vmovaps (%rsi), %ymm1
+; CHECK_O0-NEXT: vmovdqa (%rdx), %ymm2
+; CHECK_O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK_O0-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK_O0-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK_O0-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT: callq dummy
+; CHECK_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload
+; CHECK_O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK_O0-NEXT: vmovapd %ymm0, (%rdx)
+; CHECK_O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; CHECK_O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1 # 32-byte Reload
+; CHECK_O0-NEXT: vmovaps %ymm1, (%rsi)
+; CHECK_O0-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
+; CHECK_O0-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm2 # 32-byte Reload
+; CHECK_O0-NEXT: vmovdqa %ymm2, (%rdi)
+; CHECK_O0-NEXT: addq $152, %rsp
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
entry:
  %0 = bitcast double* %d to <4 x double>*
  %tmp1.i = load <4 x double>, <4 x double>* %0, align 32
@@ -27,62 +76,115 @@ declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
;; The two tests below check that we must fold load + scalar_to_vector
;; + ins_subvec+ zext into only a single vmovss or vmovsd or vinsertps from memory
-; CHECK: mov00
define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
+; CHECK-LABEL: mov00:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: mov00:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK_O0-NEXT: # implicit-def: %YMM1
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK_O0-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; CHECK_O0-NEXT: retq
  %val = load float, float* %ptr
-; CHECK: vmovss (%
  %i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
  ret <8 x float> %i0
-; CHECK: ret
}
-; CHECK: mov01
define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
+; CHECK-LABEL: mov01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: mov01:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK_O0-NEXT: # implicit-def: %YMM1
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK_O0-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3]
+; CHECK_O0-NEXT: retq
  %val = load double, double* %ptr
-; CHECK: vmovsd (%
  %i0 = insertelement <4 x double> zeroinitializer, double %val, i32 0
  ret <4 x double> %i0
}
-; CHECK: vmovaps %ymm
define void @storev16i16(<16 x i16> %a) nounwind {
+; CHECK-LABEL: storev16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+;
+; CHECK_O0-LABEL: storev16i16:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqa %ymm0, (%rax)
  store <16 x i16> %a, <16 x i16>* undef, align 32
  unreachable
}
-; CHECK: storev16i16_01
-; CHECK: vextractf128
-; CHECK: vmovups %xmm
define void @storev16i16_01(<16 x i16> %a) nounwind {
+; CHECK-LABEL: storev16i16_01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+;
+; CHECK_O0-LABEL: storev16i16_01:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqu %ymm0, (%rax)
  store <16 x i16> %a, <16 x i16>* undef, align 4
  unreachable
}
-; CHECK: storev32i8
-; CHECK: vmovaps %ymm
define void @storev32i8(<32 x i8> %a) nounwind {
+; CHECK-LABEL: storev32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+;
+; CHECK_O0-LABEL: storev32i8:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqa %ymm0, (%rax)
  store <32 x i8> %a, <32 x i8>* undef, align 32
  unreachable
}
-; CHECK: storev32i8_01
-; CHECK: vextractf128
-; CHECK: vmovups %xmm
define void @storev32i8_01(<32 x i8> %a) nounwind {
+; CHECK-LABEL: storev32i8_01:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+;
+; CHECK_O0-LABEL: storev32i8_01:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0-NEXT: vmovdqu %ymm0, (%rax)
  store <32 x i8> %a, <32 x i8>* undef, align 4
  unreachable
}
; It is faster to make two saves, if the data is already in XMM registers. For
; example, after making an integer operation.
-; CHECK: _double_save
-; CHECK-NOT: vinsertf128 $1
-; CHECK-NOT: vinsertf128 $0
-; CHECK: vmovaps %xmm
-; CHECK: vmovaps %xmm
define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
-entry:
+; CHECK-LABEL: double_save:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: double_save:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
+; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %Z, <8 x i32>* %P, align 16
  ret void
@@ -90,60 +192,127 @@ entry:
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind
-; CHECK_O0: _f_f
-; CHECK-O0: vmovss LCPI
-; CHECK-O0: vxorps %xmm
-; CHECK-O0: vmovss %xmm
define void @f_f() nounwind {
+; CHECK-LABEL: f_f:
+; CHECK: # BB#0: # %allocas
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB8_2
+; CHECK-NEXT: # BB#1: # %cif_mask_all
+; CHECK-NEXT: .LBB8_2: # %cif_mask_mixed
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB8_4
+; CHECK-NEXT: # BB#3: # %cif_mixed_test_all
+; CHECK-NEXT: movl $-1, %eax
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
+; CHECK-NEXT: .LBB8_4: # %cif_mixed_test_any_check
+;
+; CHECK_O0-LABEL: f_f:
+; CHECK_O0: # BB#0: # %allocas
+; CHECK_O0-NEXT: # implicit-def: %AL
+; CHECK_O0-NEXT: testb $1, %al
+; CHECK_O0-NEXT: jne .LBB8_1
+; CHECK_O0-NEXT: jmp .LBB8_2
+; CHECK_O0-NEXT: .LBB8_1: # %cif_mask_all
+; CHECK_O0-NEXT: .LBB8_2: # %cif_mask_mixed
+; CHECK_O0-NEXT: # implicit-def: %AL
+; CHECK_O0-NEXT: testb $1, %al
+; CHECK_O0-NEXT: jne .LBB8_3
+; CHECK_O0-NEXT: jmp .LBB8_4
+; CHECK_O0-NEXT: .LBB8_3: # %cif_mixed_test_all
+; CHECK_O0-NEXT: movl $-1, %eax
+; CHECK_O0-NEXT: vmovd %eax, %xmm0
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
+; CHECK_O0-NEXT: # implicit-def: %RCX
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmaskmovps %ymm2, %ymm1, (%rcx)
+; CHECK_O0-NEXT: .LBB8_4: # %cif_mixed_test_any_check
allocas:
  br i1 undef, label %cif_mask_all, label %cif_mask_mixed
-cif_mask_all: ; preds = %allocas
+cif_mask_all:
  unreachable
-cif_mask_mixed: ; preds = %allocas
+cif_mask_mixed:
  br i1 undef, label %cif_mixed_test_all, label %cif_mixed_test_any_check
-cif_mixed_test_all: ; preds = %cif_mask_mixed
+cif_mixed_test_all:
  call void @llvm.x86.avx.maskstore.ps.256(i8* undef, <8 x i32> <i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x float> undef) nounwind
  unreachable
-cif_mixed_test_any_check: ; preds = %cif_mask_mixed
+cif_mixed_test_any_check:
  unreachable
}
-; CHECK: add8i32
-; CHECK: vmovups
-; CHECK: vmovups
-; CHECK-NOT: vinsertf128
-; CHECK-NOT: vextractf128
-; CHECK: vmovups
-; CHECK: vmovups
define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
+; CHECK-LABEL: add8i32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovups (%rsi), %xmm0
+; CHECK-NEXT: vmovups 16(%rsi), %xmm1
+; CHECK-NEXT: vmovups %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: add8i32:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovdqu (%rsi), %xmm0
+; CHECK_O0-NEXT: vmovdqu 16(%rsi), %xmm1
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
+; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
  %b = load <8 x i32>, <8 x i32>* %bp, align 1
  %x = add <8 x i32> zeroinitializer, %b
  store <8 x i32> %x, <8 x i32>* %ret, align 1
  ret void
}
-; CHECK: add4i64a64
-; CHECK: vmovaps ({{.*}}), %ymm{{.*}}
-; CHECK: vmovaps %ymm{{.*}}, ({{.*}})
define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+; CHECK-LABEL: add4i64a64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rsi), %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: add4i64a64:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovaps (%rsi), %ymm0
+; CHECK_O0-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
  %b = load <4 x i64>, <4 x i64>* %bp, align 64
  %x = add <4 x i64> zeroinitializer, %b
  store <4 x i64> %x, <4 x i64>* %ret, align 64
  ret void
}
-; CHECK: add4i64a16
-; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
-; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}}
-; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
-; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}})
define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
+; CHECK-LABEL: add4i64a16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rsi), %xmm0
+; CHECK-NEXT: vmovaps 16(%rsi), %xmm1
+; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: retq
+;
+; CHECK_O0-LABEL: add4i64a16:
+; CHECK_O0: # BB#0:
+; CHECK_O0-NEXT: vmovdqa (%rsi), %xmm0
+; CHECK_O0-NEXT: vmovdqa 16(%rsi), %xmm1
+; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
+; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
+; CHECK_O0-NEXT: vzeroupper
+; CHECK_O0-NEXT: retq
  %b = load <4 x i64>, <4 x i64>* %bp, align 16
  %x = add <4 x i64> zeroinitializer, %b
  store <4 x i64> %x, <4 x i64>* %ret, align 16
  ret void
}
+
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index 47e95fe31bdff..a12a412fb94d6 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -10,8 +10,8 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_addpd:
; SANDY: # BB#0:
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addpd:
; HASWELL: # BB#0:
@@
-21,14 +21,14 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; BTVER2-LABEL: test_addpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_addpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = fadd <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 @@ -40,8 +40,8 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; SANDY-LABEL: test_addps: ; SANDY: # BB#0: ; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_addps: ; HASWELL: # BB#0: @@ -51,14 +51,14 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; BTVER2-LABEL: test_addps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_addps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = fadd <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 @@ -70,8 +70,8 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; SANDY-LABEL: test_addsubpd: ; SANDY: # BB#0: ; SANDY-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_addsubpd: ; HASWELL: # BB#0: @@ -81,14 +81,14 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; ; BTVER2-LABEL: test_addsubpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_addsubpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) %2 = load <4 x double>, <4 x double> *%a2, align 32 @@ -101,8 
+101,8 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; SANDY-LABEL: test_addsubps: ; SANDY: # BB#0: ; SANDY-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_addsubps: ; HASWELL: # BB#0: @@ -112,14 +112,14 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; ; BTVER2-LABEL: test_addsubps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_addsubps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] +; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] +; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = load <8 x float>, <8 x float> *%a2, align 32 @@ -131,10 +131,10 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; SANDY-LABEL: test_andnotpd: ; SANDY: # BB#0: -; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_andnotpd: ; HASWELL: # BB#0: @@ -147,14 +147,14 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub ; BTVER2: # BB#0: ; BTVER2-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; BTVER2-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_andnotpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> @@ -172,10 +172,10 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; SANDY-LABEL: test_andnotps: ; SANDY: # BB#0: -; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00] ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_andnotps: ; HASWELL: # 
BB#0: @@ -188,14 +188,14 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> ; BTVER2: # BB#0: ; BTVER2-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; BTVER2-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_andnotps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> @@ -213,10 +213,10 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; SANDY-LABEL: test_andpd: ; SANDY: # BB#0: -; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00] ; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_andpd: ; HASWELL: # BB#0: @@ -229,14 +229,14 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; BTVER2: # BB#0: ; BTVER2-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; BTVER2-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_andpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = bitcast <4 x double> %a0 to <4 x i64> %2 = bitcast <4 x double> %a1 to <4 x i64> @@ -252,10 +252,10 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; SANDY-LABEL: test_andps: ; SANDY: # BB#0: -; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50] +; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_andps: ; HASWELL: # BB#0: @@ -268,14 +268,14 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; BTVER2: # BB#0: ; BTVER2-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; BTVER2-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_andps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: 
vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = bitcast <8 x float> %a0 to <4 x i64> %2 = bitcast <8 x float> %a1 to <4 x i64> @@ -291,10 +291,10 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; SANDY-LABEL: test_blendpd: ; SANDY: # BB#0: -; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] +; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:1.00] ; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_blendpd: ; HASWELL: # BB#0: @@ -306,14 +306,14 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl ; BTVER2-LABEL: test_blendpd: ; BTVER2: # BB#0: ; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] -; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_blendpd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3> @@ -326,9 +326,9 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; SANDY-LABEL: test_blendps: ; SANDY: # BB#0: -; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50] -; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:1.00] +; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_blendps: ; HASWELL: # BB#0: @@ -356,9 +356,9 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> * define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) { ; SANDY-LABEL: test_blendvpd: ; SANDY: # BB#0: -; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_blendvpd: ; HASWELL: # BB#0: @@ -387,9 +387,9 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) { ; SANDY-LABEL: 
test_blendvps: ; SANDY: # BB#0: -; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_blendvps: ; HASWELL: # BB#0: @@ -418,8 +418,8 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f define <8 x float> @test_broadcastf128(<4 x float> *%a0) { ; SANDY-LABEL: test_broadcastf128: ; SANDY: # BB#0: -; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [3:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_broadcastf128: ; HASWELL: # BB#0: @@ -443,8 +443,8 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) { define <4 x double> @test_broadcastsd_ymm(double *%a0) { ; SANDY-LABEL: test_broadcastsd_ymm: ; SANDY: # BB#0: -; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_broadcastsd_ymm: ; HASWELL: # BB#0: @@ -469,8 +469,8 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) { define <4 x float> @test_broadcastss(float *%a0) { ; SANDY-LABEL: test_broadcastss: ; SANDY: # BB#0: -; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [6:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_broadcastss: ; HASWELL: # BB#0: @@ -496,7 +496,7 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) { ; SANDY-LABEL: test_broadcastss_ymm: ; SANDY: # BB#0: ; SANDY-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_broadcastss_ymm: ; HASWELL: # BB#0: @@ -522,9 +522,9 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; SANDY-LABEL: test_cmppd: ; SANDY: # BB#0: ; SANDY-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cmppd: ; HASWELL: # BB#0: @@ -560,9 +560,9 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; SANDY-LABEL: test_cmpps: ; SANDY: # BB#0: ; SANDY-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00] -; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cmpps: ; HASWELL: # BB#0: @@ -598,9 +598,9 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; SANDY-LABEL: test_cvtdq2pd: ; SANDY: # BB#0: ; SANDY-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00] -; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] +; SANDY-NEXT: 
vcvtdq2pd (%rdi), %ymm1 # sched: [10:1.00] ; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtdq2pd: ; HASWELL: # BB#0: @@ -613,14 +613,14 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] ; BTVER2-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_cvtdq2pd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00] ; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = sitofp <4 x i32> %a0 to <4 x double> %2 = load <4 x i32>, <4 x i32> *%a1, align 16 @@ -632,12 +632,12 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) { ; SANDY-LABEL: test_cvtdq2ps: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00] -; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50] -; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [5:1.00] -; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [4:1.00] +; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50] +; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [7:1.00] +; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [3:1.00] ; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtdq2ps: ; HASWELL: # BB#0: @@ -650,14 +650,14 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] ; BTVER2-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_cvtdq2ps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00] ; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = sitofp <8 x i32> %a0 to <8 x float> %2 = load <8 x i32>, <8 x i32> *%a1, align 16 @@ -669,10 +669,10 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) { define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) { ; SANDY-LABEL: test_cvtpd2dq: ; SANDY: # BB#0: -; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [11:1.00] ; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtpd2dq: ; HASWELL: # BB#0: @@ -704,10 +704,10 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) { define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) { ; SANDY-LABEL: test_cvtpd2ps: ; SANDY: # BB#0: -; SANDY-NEXT: 
vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00] ; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtpd2ps: ; HASWELL: # BB#0: @@ -741,8 +741,8 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) { ; SANDY: # BB#0: ; SANDY-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00] ; SANDY-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00] -; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtps2dq: ; HASWELL: # BB#0: @@ -774,9 +774,9 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) { define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; SANDY-LABEL: test_divpd: ; SANDY: # BB#0: -; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [12:1.00] -; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [16:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [45:3.00] +; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [52:3.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_divpd: ; HASWELL: # BB#0: @@ -786,14 +786,14 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; BTVER2-LABEL: test_divpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00] -; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00] +; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_divpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [19:19.00] -; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00] +; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = fdiv <4 x double> %a0, %a1 %2 = load <4 x double>, <4 x double> *%a2, align 32 @@ -804,9 +804,9 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; SANDY-LABEL: test_divps: ; SANDY: # BB#0: -; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [12:1.00] -; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [16:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [29:3.00] +; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [36:3.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_divps: ; HASWELL: # BB#0: @@ -816,14 +816,14 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; ; BTVER2-LABEL: test_divps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00] -; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [24:19.00] +; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00] +; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_divps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:19.00] -; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: 
[24:19.00] +; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00] +; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = fdiv <8 x float> %a0, %a1 %2 = load <8 x float>, <8 x float> *%a2, align 32 @@ -834,9 +834,9 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; SANDY-LABEL: test_dpps: ; SANDY: # BB#0: -; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:2.00] ; SANDY-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_dpps: ; HASWELL: # BB#0: @@ -866,9 +866,9 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa ; SANDY-LABEL: test_extractf128: ; SANDY: # BB#0: ; SANDY-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00] -; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [5:1.00] ; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_extractf128: ; HASWELL: # BB#0: @@ -900,7 +900,7 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double ; SANDY: # BB#0: ; SANDY-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; SANDY-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_haddpd: ; HASWELL: # BB#0: @@ -929,9 +929,9 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; SANDY-LABEL: test_haddps: ; SANDY: # BB#0: -; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_haddps: ; HASWELL: # BB#0: @@ -960,9 +960,9 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) { ; SANDY-LABEL: test_hsubpd: ; SANDY: # BB#0: -; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_hsubpd: ; HASWELL: # BB#0: @@ -991,9 +991,9 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) { ; SANDY-LABEL: test_hsubps: ; SANDY: # BB#0: -; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00] +; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_hsubps: ; HASWELL: # BB#0: @@ -1023,9 +1023,9 @@ define <8 x float> 
@test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float ; SANDY-LABEL: test_insertf128: ; SANDY: # BB#0: ; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00] -; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:1.00] ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_insertf128: ; HASWELL: # BB#0: @@ -1038,14 +1038,14 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float ; BTVER2: # BB#0: ; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] ; BTVER2-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_insertf128: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50] ; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> %2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> @@ -1059,8 +1059,8 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float define <32 x i8> @test_lddqu(i8* %a0) { ; SANDY-LABEL: test_lddqu: ; SANDY: # BB#0: -; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [6:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_lddqu: ; HASWELL: # BB#0: @@ -1084,10 +1084,10 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) { ; SANDY-LABEL: test_maskmovpd: ; SANDY: # BB#0: -; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00] -; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:2.00] +; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00] ; SANDY-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maskmovpd: ; HASWELL: # BB#0: @@ -1119,10 +1119,10 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2) { ; SANDY-LABEL: test_maskmovpd_ymm: ; SANDY: # BB#0: -; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [5:1.00] ; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; SANDY-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maskmovpd_ymm: ; HASWELL: # BB#0: @@ -1154,10 +1154,10 @@ declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwi define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) { ; SANDY-LABEL: test_maskmovps: ; SANDY: # BB#0: -; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 
# sched: [?:0.000000e+00] -; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:2.00] +; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00] ; SANDY-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maskmovps: ; HASWELL: # BB#0: @@ -1189,10 +1189,10 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) { ; SANDY-LABEL: test_maskmovps_ymm: ; SANDY: # BB#0: -; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00] +; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [1:0.50] ; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00] ; SANDY-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maskmovps_ymm: ; HASWELL: # BB#0: @@ -1225,8 +1225,8 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; SANDY-LABEL: test_maxpd: ; SANDY: # BB#0: ; SANDY-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maxpd: ; HASWELL: # BB#0: @@ -1256,8 +1256,8 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; SANDY-LABEL: test_maxps: ; SANDY: # BB#0: ; SANDY-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maxps: ; HASWELL: # BB#0: @@ -1288,7 +1288,7 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; SANDY: # BB#0: ; SANDY-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; SANDY-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_minpd: ; HASWELL: # BB#0: @@ -1319,7 +1319,7 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ; SANDY: # BB#0: ; SANDY-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] ; SANDY-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_minps: ; HASWELL: # BB#0: @@ -1348,10 +1348,10 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) { ; SANDY-LABEL: test_movapd: ; SANDY: # BB#0: -; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50] +; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [7:0.50] ; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movapd: ; HASWELL: # BB#0: @@ -1363,14 +1363,14 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) { ; BTVER2-LABEL: test_movapd: ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] -; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 
# sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_movapd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = load <4 x double>, <4 x double> *%a0, align 32 @@ -1382,10 +1382,10 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) { define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) { ; SANDY-LABEL: test_movaps: ; SANDY: # BB#0: -; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50] +; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [7:0.50] ; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movaps: ; HASWELL: # BB#0: @@ -1397,14 +1397,14 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) { ; BTVER2-LABEL: test_movaps: ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_movaps: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = load <8 x float>, <8 x float> *%a0, align 32 @@ -1417,9 +1417,9 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) { ; SANDY-LABEL: test_movddup: ; SANDY: # BB#0: ; SANDY-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00] -; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50] +; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50] ; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movddup: ; HASWELL: # BB#0: @@ -1432,14 +1432,14 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] ; BTVER2-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] -; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_movddup: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00] ; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50] -; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> %2 = load <4 x double>, <4 x double> *%a1, align 32 @@ -1451,9 +1451,9 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) { define i32 @test_movmskpd(<4 x double> %a0) { ; SANDY-LABEL: 
test_movmskpd: ; SANDY: # BB#0: -; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.33] +; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00] ; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movmskpd: ; HASWELL: # BB#0: @@ -1479,9 +1479,9 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone define i32 @test_movmskps(<8 x float> %a0) { ; SANDY-LABEL: test_movmskps: ; SANDY: # BB#0: -; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.33] +; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00] ; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movmskps: ; HASWELL: # BB#0: @@ -1508,8 +1508,8 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) { ; SANDY-LABEL: test_movntpd: ; SANDY: # BB#0: ; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movntpd: ; HASWELL: # BB#0: @@ -1519,13 +1519,13 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) { ; ; BTVER2-LABEL: test_movntpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_movntpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = fadd <4 x double> %a0, %a0 @@ -1537,8 +1537,8 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) { ; SANDY-LABEL: test_movntps: ; SANDY: # BB#0: ; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movntps: ; HASWELL: # BB#0: @@ -1548,13 +1548,13 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) { ; ; BTVER2-LABEL: test_movntps: ; BTVER2: # BB#0: -; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_movntps: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = fadd <8 x float> %a0, %a0 @@ -1566,9 +1566,9 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) { ; SANDY-LABEL: test_movshdup: ; SANDY: # BB#0: ; SANDY-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00] -; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50] +; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50] ; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movshdup: ; HASWELL: # BB#0: 
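Every test in avx-schedule.ll repeats one shape: an IR operation is checked against per-CPU assembly, and the trailing "# sched: [N:M.MM]" comment encodes the expected latency and reciprocal throughput of that instruction on that subtarget, which is why these hunks are almost entirely a re-tuning of the bracketed numbers. A minimal sketch of such a test follows; the RUN lines and the function name sched_example are illustrative assumptions (these files are normally regenerated with utils/update_llc_test_checks.py rather than written by hand), while the sched values themselves are taken from the test_addpd hunk above:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule \
; RUN:   | FileCheck %s --check-prefix=SANDY

; The register-register vaddpd is expected to have latency 3 and reciprocal
; throughput 1.00 on Sandy Bridge; the folded-load form pays the load latency
; on top of that (the [10:1.00] value this patch switches the checks to).
; SANDY-LABEL: sched_example:
; SANDY: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
define <4 x double> @sched_example(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
  %1 = fadd <4 x double> %a0, %a1
  %2 = load <4 x double>, <4 x double>* %a2, align 32
  %3 = fadd <4 x double> %1, %2
  ret <4 x double> %3
}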
@@ -1581,14 +1581,14 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] ; BTVER2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] -; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_movshdup: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00] ; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> %2 = load <8 x float>, <8 x float> *%a1, align 32 @@ -1601,9 +1601,9 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) { ; SANDY-LABEL: test_movsldup: ; SANDY: # BB#0: ; SANDY-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00] -; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50] +; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50] ; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movsldup: ; HASWELL: # BB#0: @@ -1616,14 +1616,14 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) { ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] ; BTVER2-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] -; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_movsldup: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00] ; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50] -; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> %2 = load <8 x float>, <8 x float> *%a1, align 32 @@ -1635,12 +1635,12 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) { define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) { ; SANDY-LABEL: test_movupd: ; SANDY: # BB#0: -; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] -; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50] +; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00] ; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00] -; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00] +; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movupd: ; HASWELL: # BB#0: @@ -1652,14 +1652,14 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) { ; BTVER2-LABEL: test_movupd: ; BTVER2: # BB#0: ; 
BTVER2-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] -; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_movupd: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = load <4 x double>, <4 x double> *%a0, align 1 @@ -1671,12 +1671,12 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) { define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) { ; SANDY-LABEL: test_movups: ; SANDY: # BB#0: -; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] -; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50] +; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:1.00] ; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00] -; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00] +; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movups: ; HASWELL: # BB#0: @@ -1688,14 +1688,14 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) { ; BTVER2-LABEL: test_movups: ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_movups: ; ZNVER1: # BB#0: ; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00] -; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00] +; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00] ; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = load <8 x float>, <8 x float> *%a0, align 1 @@ -1708,8 +1708,8 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; SANDY-LABEL: test_mulpd: ; SANDY: # BB#0: ; SANDY-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mulpd: ; HASWELL: # BB#0: @@ -1719,14 +1719,14 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> ; ; BTVER2-LABEL: test_mulpd: ; BTVER2: # BB#0: -; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00] +; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-LABEL: test_mulpd: ; ZNVER1: # BB#0: -; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00] +; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00] +; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00] ; ZNVER1-NEXT: retq # sched: [4:1.00] %1 = fmul <4 x 
double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -1738,8 +1738,8 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_mulps:
; SANDY: # BB#0:
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulps:
; HASWELL: # BB#0:
@@ -1749,14 +1749,14 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_mulps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fmul <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -1767,10 +1767,10 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: orpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: orpd:
; HASWELL: # BB#0:
@@ -1783,14 +1783,14 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
; BTVER2: # BB#0:
; BTVER2-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: orpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -1806,10 +1806,10 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_orps:
; SANDY: # BB#0:
-; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orps:
; HASWELL: # BB#0:
@@ -1822,14 +1822,14 @@ define <8 x float> @test_orps(<8 x float> %a1, <8 x float> *%a2
; BTVER2: # BB#0:
; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_orps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -1846,9 +1846,9 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
; SANDY-LABEL: test_permilpd:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd:
; HASWELL: # BB#0:
@@ -1880,10 +1880,10 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_permilpd_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [8:1.00]
; SANDY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd_ymm:
; HASWELL: # BB#0:
@@ -1896,14 +1896,14 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00]
; BTVER2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilpd_ymm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00]
; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -1916,9 +1916,9 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
; SANDY-LABEL: test_permilps:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps:
; HASWELL: # BB#0:
@@ -1950,10 +1950,10 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_permilps_ymm:
; SANDY: # BB#0:
-; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [8:1.00]
; SANDY-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps_ymm:
; HASWELL: # BB#0:
@@ -1966,14 +1966,14 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00]
; BTVER2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilps_ymm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00]
; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -1986,8 +1986,8 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64>
; SANDY-LABEL: test_permilvarpd:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd:
; HASWELL: # BB#0:
@@ -2018,7 +2018,7 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x
; SANDY: # BB#0:
; SANDY-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; SANDY-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd_ymm:
; HASWELL: # BB#0:
@@ -2048,8 +2048,8 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *
; SANDY-LABEL: test_permilvarps:
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps:
; HASWELL: # BB#0:
@@ -2080,7 +2080,7 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3
; SANDY: # BB#0:
; SANDY-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
; SANDY-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps_ymm:
; HASWELL: # BB#0:
@@ -2112,7 +2112,7 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vrcpps (%rdi), %ymm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpps:
; HASWELL: # BB#0:
@@ -2123,16 +2123,16 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_rcpps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00]
-; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00]
+; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rcpps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:1.00]
-; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00]
+; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2148,7 +2148,7 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundpd:
; HASWELL: # BB#0:
@@ -2161,14 +2161,14 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -2184,7 +2184,7 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [7:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundps:
; HASWELL: # BB#0:
@@ -2197,14 +2197,14 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00]
; BTVER2-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2217,10 +2217,10 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_rsqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [14:3.00]
+; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:3.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtps:
; HASWELL: # BB#0:
@@ -2231,16 +2231,16 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_rsqrtps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00]
-; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00]
+; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rsqrtps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:1.00]
-; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:1.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00]
+; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2254,9 +2254,9 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; SANDY-LABEL: test_shufpd:
; SANDY: # BB#0:
; SANDY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufpd:
; HASWELL: # BB#0:
@@ -2269,14 +2269,14 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
; BTVER2: # BB#0:
; BTVER2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
; BTVER2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_shufpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2289,8 +2289,8 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
; SANDY-LABEL: test_shufps:
; SANDY: # BB#0:
; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufps:
; HASWELL: # BB#0:
@@ -2318,10 +2318,10 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
; SANDY-LABEL: test_sqrtpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [52:3.00]
+; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [45:3.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtpd:
; HASWELL: # BB#0:
@@ -2332,16 +2332,16 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
;
; BTVER2-LABEL: test_sqrtpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00]
-; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00]
+; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [26:21.00]
-; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [21:21.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00]
+; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
%2 = load <4 x double>, <4 x double> *%a1, align 32
@@ -2354,10 +2354,10 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
; SANDY-LABEL: test_sqrtps:
; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [36:3.00]
+; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [29:3.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtps:
; HASWELL: # BB#0:
@@ -2368,16 +2368,16 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; BTVER2-LABEL: test_sqrtps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00]
-; BTVER2-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00]
-; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00]
+; BTVER2-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00]
+; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [26:21.00]
-; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:21.00]
-; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00]
+; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00]
+; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
@@ -2391,8 +2391,8 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; SANDY-LABEL: test_subpd:
; SANDY: # BB#0:
; SANDY-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subpd:
; HASWELL: # BB#0:
@@ -2402,14 +2402,14 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
;
; BTVER2-LABEL: test_subpd:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subpd:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fsub <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2421,8 +2421,8 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; SANDY-LABEL: test_subps:
; SANDY: # BB#0:
; SANDY-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subps:
; HASWELL: # BB#0:
@@ -2432,14 +2432,14 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
;
; BTVER2-LABEL: test_subps:
; BTVER2: # BB#0:
-; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subps:
; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = fsub <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
@@ -2451,11 +2451,11 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; SANDY-LABEL: test_testpd:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [7:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd:
; HASWELL: # BB#0:
@@ -2495,12 +2495,12 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
; SANDY-LABEL: test_testpd_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd_ymm:
; HASWELL: # BB#0:
@@ -2542,11 +2542,11 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SANDY-LABEL: test_testps:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [7:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps:
; HASWELL: # BB#0:
@@ -2586,12 +2586,12 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
; SANDY-LABEL: test_testps_ymm:
; SANDY: # BB#0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [8:1.00]
; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps_ymm:
; HASWELL: # BB#0:
@@ -2635,7 +2635,7 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
; SANDY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhpd:
; HASWELL: # BB#0:
@@ -2648,14 +2648,14 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; BTVER2: # BB#0:
; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpckhpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2669,7 +2669,7 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY: # BB#0:
; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhps:
; HASWELL: # BB#0:
@@ -2698,9 +2698,9 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; SANDY-LABEL: test_unpcklpd:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
; HASWELL: # BB#0:
@@ -2713,14 +2713,14 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
; BTVER2: # BB#0:
; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpcklpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%2 = load <4 x double>, <4 x double> *%a2, align 32
@@ -2733,8 +2733,8 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
; SANDY-LABEL: test_unpcklps:
; SANDY: # BB#0:
; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklps:
; HASWELL: # BB#0:
@@ -2762,10 +2762,10 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
; SANDY-LABEL: test_xorpd:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorpd:
; HASWELL: # BB#0:
@@ -2778,14 +2778,14 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
; BTVER2: # BB#0:
; BTVER2-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
@@ -2801,10 +2801,10 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
; SANDY-LABEL: test_xorps:
; SANDY: # BB#0:
-; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorps:
; HASWELL: # BB#0:
@@ -2817,14 +2817,14 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
; BTVER2: # BB#0:
; BTVER2-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
@@ -2841,7 +2841,7 @@ define void @test_zeroall() {
; SANDY-LABEL: test_zeroall:
; SANDY: # BB#0:
; SANDY-NEXT: vzeroall # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroall:
; HASWELL: # BB#0:
@@ -2866,7 +2866,7 @@ define void @test_zeroupper() {
; SANDY-LABEL: test_zeroupper:
; SANDY: # BB#0:
; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroupper:
; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/avx-unpack.ll b/test/CodeGen/X86/avx-unpack.ll
index 6924d98b38b17..7826bc97eec57 100644
--- a/test/CodeGen/X86/avx-unpack.ll
+++ b/test/CodeGen/X86/avx-unpack.ll
@@ -1,57 +1,84 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
-; CHECK: vunpckhps
define <8 x float> @unpackhips(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhips:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x float> %shuffle.i
}
-; CHECK: vunpckhpd
define <4 x double> @unpackhipd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhipd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x double> %shuffle.i
}
-; CHECK: vunpcklps
define <8 x float> @unpacklops(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklops:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x float> %shuffle.i
}
-; CHECK: vunpcklpd
define <4 x double> @unpacklopd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklopd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x double> %shuffle.i
}
-; CHECK-NOT: vunpcklps %ymm
-define <8 x float> @unpacklops-not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+define <8 x float> @unpacklops_not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpacklops_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x float> %shuffle.i
}
-; CHECK-NOT: vunpcklpd %ymm
-define <4 x double> @unpacklopd-not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+define <4 x double> @unpacklopd_not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpacklopd_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
ret <4 x double> %shuffle.i
}
-; CHECK-NOT: vunpckhps %ymm
-define <8 x float> @unpackhips-not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
-entry:
+define <8 x float> @unpackhips_not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpackhips_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,3,u,4,u,5]
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,u,3,u,4,u,5,u]
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13>
ret <8 x float> %shuffle.i
}
-; CHECK-NOT: vunpckhpd %ymm
-define <4 x double> @unpackhipd-not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
-entry:
+define <4 x double> @unpackhipd_not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: unpackhipd_not:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
ret <4 x double> %shuffle.i
}
@@ -60,102 +87,135 @@ entry:
;;;; Unpack versions using the fp unit for int unpacking
;;;;
-; CHECK: vunpckhps
define <8 x i32> @unpackhips1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhips1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpckhps (%
define <8 x i32> @unpackhips2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhips2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: retq
%a = load <8 x i32>, <8 x i32>* %src1
%b = load <8 x i32>, <8 x i32>* %src2
%shuffle.i = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpckhpd
define <4 x i64> @unpackhipd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhipd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x i64> %shuffle.i
}
-; CHECK: vunpckhpd (%
define <4 x i64> @unpackhipd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhipd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: retq
%a = load <4 x i64>, <4 x i64>* %src1
%b = load <4 x i64>, <4 x i64>* %src2
%shuffle.i = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x i64> %shuffle.i
}
-; CHECK: vunpcklps
define <8 x i32> @unpacklops1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklops1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpcklps (%
define <8 x i32> @unpacklops2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklops2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: retq
%a = load <8 x i32>, <8 x i32>* %src1
%b = load <8 x i32>, <8 x i32>* %src2
%shuffle.i = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i32> %shuffle.i
}
-; CHECK: vunpcklpd
define <4 x i64> @unpacklopd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklopd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x i64> %shuffle.i
}
-; CHECK: vunpcklpd (%
define <4 x i64> @unpacklopd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklopd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: retq
%a = load <4 x i64>, <4 x i64>* %src1
%b = load <4 x i64>, <4 x i64>* %src2
%shuffle.i = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x i64> %shuffle.i
}
-; CHECK: vpunpckhwd
-; CHECK: vpunpckhwd
-; CHECK: vinsertf128
define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhwd_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i16> %shuffle.i
}
-; CHECK: vpunpcklwd
-; CHECK: vpunpcklwd
-; CHECK: vinsertf128
define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklwd_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
ret <16 x i16> %shuffle.i
}
-; CHECK: vpunpckhbw
-; CHECK: vpunpckhbw
-; CHECK: vinsertf128
define <32 x i8> @unpackhbw_undef(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpackhbw_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <32 x i8> %shuffle.i
}
-; CHECK: vpunpcklbw
-; CHECK: vpunpcklbw
-; CHECK: vinsertf128
define <32 x i8> @unpacklbw_undef(<32 x i8> %src1) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: unpacklbw_undef:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
ret <32 x i8> %shuffle.i
}
+
diff --git a/test/CodeGen/X86/avx-vinsertf128.ll b/test/CodeGen/X86/avx-vinsertf128.ll
index 38389de7a8a10..b7a4d5b5c308e 100644
--- a/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,30 +1,37 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
-; CHECK-LABEL: A:
-; CHECK-NOT: vunpck
-; CHECK: vinsertf128 $1
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: A:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %shuffle
}
-; CHECK-LABEL: B:
-; CHECK-NOT: vunpck
-; CHECK: vinsertf128 $1
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: B:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 1>
ret <4 x double> %shuffle
}
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-; Just check that no crash happens
-; CHECK-LABEL: _insert_crash:
define void @insert_crash() nounwind {
+; CHECK-LABEL: insert_crash:
+; CHECK: # BB#0: # %allocas
+; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vminpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vminsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; CHECK-NEXT: vmovups %xmm0, (%rax)
+; CHECK-NEXT: retq
allocas:
%v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%ret_0a.i.i.i452 = shufflevector <4 x double> %v1.i.i451, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -40,72 +47,87 @@ allocas:
;; DAG Combine must remove useless vinsertf128 instructions
-; CHECK-LABEL: DAGCombineA:
-; CHECK-NOT: vinsertf128 $1
define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
- %1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %2
+; CHECK-LABEL: DAGCombineA:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
%t1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = shufflevector <8 x i32> %t1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %t2
}
-; CHECK-LABEL: DAGCombineB:
-; CHECK: vpaddd %xmm
-; CHECK-NOT: vinsertf128 $1
-; CHECK: vpaddd %xmm
define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
- %1 = add <8 x i32> %v1, %v2
- %2 = add <8 x i32> %1, %v1
- ret <8 x i32> %2
+; CHECK-LABEL: DAGCombineB:
+; CHECK: # BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%t1 = add <8 x i32> %v1, %v2
%t2 = add <8 x i32> %t1, %v1
ret <8 x i32> %t2
}
-; CHECK-LABEL: insert_undef_pd:
define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vmovaps %ymm1, %ymm0
+; CHECK-LABEL: insert_undef_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-; CHECK-LABEL: insert_undef_ps:
define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vmovaps %ymm1, %ymm0
+; CHECK-LABEL: insert_undef_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-; CHECK-LABEL: insert_undef_si:
define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vmovaps %ymm1, %ymm0
+; CHECK-LABEL: insert_undef_si:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
; rdar://10643481
-; CHECK-LABEL: vinsertf128_combine:
define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
-; CHECK-NOT: vmovaps
-; CHECK: vinsertf128
-entry:
+; CHECK-LABEL: vinsertf128_combine:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%add.ptr = getelementptr inbounds float, float* %f, i64 4
- %0 = bitcast float* %add.ptr to <4 x float>*
- %1 = load <4 x float>, <4 x float>* %0, align 16
- %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %1, i8 1)
- ret <8 x float> %2
+ %t0 = bitcast float* %add.ptr to <4 x float>*
+ %t1 = load <4 x float>, <4 x float>* %t0, align 16
+ %t2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %t1, i8 1)
+ ret <8 x float> %t2
}
; rdar://11076953
-; CHECK-LABEL: vinsertf128_ucombine:
define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
-; CHECK-NOT: vmovups
-; CHECK: vinsertf128
-entry:
+; CHECK-LABEL: vinsertf128_ucombine:
+; CHECK: # BB#0:
+; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%add.ptr = getelementptr inbounds float, float* %f, i64 4
- %0 = bitcast float* %add.ptr to <4 x float>*
- %1 = load <4 x float>, <4 x float>* %0, align 8
- %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %1, i8 1)
+ ret <8 x float> %t2
}
+
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 971d03af3778a..318c9cfd8a3fc 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -633,13 +633,13 @@ entry:
define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp {
; X32-AVX2-LABEL: V111:
; X32-AVX2: ## BB#0: ## %entry
-; X32-AVX2-NEXT: vpbroadcastd LCPI29_0, %ymm1
+; X32-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X32-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: V111:
; X64-AVX2: ## BB#0: ## %entry
-; X64-AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
@@ -660,13 +660,13 @@ entry:
define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
; X32-AVX2-LABEL: V113:
; X32-AVX2: ## BB#0: ## %entry
-; X32-AVX2-NEXT: vbroadcastss LCPI30_0, %ymm1
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X32-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: V113:
; X64-AVX2: ## BB#0: ## %entry
-; X64-AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X64-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
@@ -687,12 +687,12 @@ entry:
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
; X32: ## BB#0:
-; X32-NEXT: vbroadcastss LCPI31_0, %xmm0
+; X32-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X32-NEXT: retl
;
; X64-LABEL: _e2:
; X64: ## BB#0:
-; X64-NEXT: vbroadcastss {{.*}}(%rip), %xmm0
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X64-NEXT: retq
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index eae7b94f5135c..b5a13404a2304 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -14,6 +14,7 @@ define double @test1(double %a, double %b) nounwind {
; ALL-NEXT: LBB0_2: ## %l2
; ALL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
+; ALL-NEXT: ## -- End function
%tobool = fcmp une double %a, %b
br i1 %tobool, label %l1, label %l2
@@ -36,6 +37,7 @@ define float @test2(float %a, float %b) nounwind {
; ALL-NEXT: LBB1_2: ## %l2
; ALL-NEXT: vaddss %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
+; ALL-NEXT: ## -- End function
%tobool = fcmp olt float %a, %b
br i1 %tobool, label %l1, label %l2
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 29a5325a0ae98..f858e7eb792fe 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -12,6 +12,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test1:
; SKX: ## BB#0:
@@ -21,6 +22,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SKX-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -36,6 +38,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; KNL-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test2:
; SKX: ## BB#0:
@@ -45,6 +48,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SKX-NEXT: vinsertf64x2 $3, %xmm0, %zmm2, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%rrr = load double, double* %br
%rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
%rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -58,6 +62,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test3:
; SKX: ## BB#0:
@@ -65,6 +70,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%eee = extractelement <16 x float> %x, i32 4
%rrr2 = insertelement <16 x float> %x, float %eee, i32 1
ret <16 x float> %rrr2
@@ -78,6 +84,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
; KNL-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test4:
; SKX: ## BB#0:
@@ -86,6 +93,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
; SKX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%eee = extractelement <8 x i64> %x, i32 4
%rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
ret <8 x i64> %rrr2
@@ -96,11 +104,13 @@ define i32 @test5(<4 x float> %x) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vextractps $3, %xmm0, %eax
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test5:
; SKX: ## BB#0:
; SKX-NEXT: vextractps $3, %xmm0, %eax
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
%ei = bitcast float %ef to i32
ret i32 %ei
@@ -111,11 +121,13 @@ define void @test6(<4 x float> %x, float* %out) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vextractps $3, %xmm0, (%rdi)
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test6:
; SKX: ## BB#0:
; SKX-NEXT: vextractps $3, %xmm0, (%rdi)
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
store float %ef, float* %out, align 4
ret void
@@ -135,6 +147,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test7:
; SKX: ## BB#0:
@@ -150,6 +163,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
}
@@ -168,6 +182,7 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test8:
; SKX: ## BB#0:
@@ -183,6 +198,7 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
}
@@ -201,6 +217,7 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
@@ -216,6 +233,7 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
}
@@ -234,6 +252,7 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
@@ -249,6 +268,7 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
}
@@ -1293,7 +1313,7 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
@@ -1457,7 +1477,7 @@ define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
; KNL-LABEL: test_extractelement_v4i1:
; KNL: ## BB#0:
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
@@ -2326,7 +2346,7 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
; KNL-LABEL: test_extractelement_varible_v4i1:
; KNL: ## BB#0:
; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 2b04b9229b3d2..b3fbceea80a94 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -8,6 +8,7 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
ret <16 x float> %max
@@ -19,6 +20,7 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
ret <8 x double> %max
@@ -30,6 +32,7 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -42,6 +45,7 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1)
; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -53,6 +57,7 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
@@ -64,6 +69,7 @@ define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) noun
; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
@@ -117,12 +123,14 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -137,12 +145,14 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = fcmp oeq <8 x float> %x, %y
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
@@ -154,6 +164,7 @@ define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
; CHECK: ## BB#0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -168,6 +179,7 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12:
; SKX: ## BB#0:
@@ -178,6 +190,7 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
@@ -330,6 +343,7 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v32i32:
; SKX: ## BB#0:
@@ -339,6 +353,7 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <32 x i32> %a, %b
%res1 = bitcast <32 x i1> %res to i32
ret i32 %res1
@@ -642,6 +657,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v64i16:
; SKX: ## BB#0:
@@ -651,6 +667,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; SKX-NEXT: kmovq %k0, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%res = icmp eq <64 x i16> %a, %b
%res1 = bitcast <64 x i1> %res to i64
ret i64 %res1
@@ -704,6 +721,7 @@ define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind
; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -715,6 +733,7 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -727,6 +746,7 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -739,6 +759,7 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -752,6 +773,7 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3
; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -766,6 +788,7 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y
; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -780,6 +803,7 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6
; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <8 x i64> %x, %y
@@ -795,6 +819,7 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask0 = icmp ule <16 x i32> %x, %y
@@ -809,6 +834,7 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -823,6 +849,7 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind
; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -838,6 +865,7 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32
; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -855,6 +883,7 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y
; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -920,12 +949,14 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
; KNL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test30:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%mask = fcmp oeq <4 x double> %x, %y
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
@@ -938,12 +969,14 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp
; KNL-NEXT: vcmpltpd (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test31:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <2 x double>, <2 x double>* %yp, align 4
%mask = fcmp olt <2 x double> %x, %y
@@ -957,12 +990,14 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp
; KNL-NEXT: vcmpltpd (%rdi), %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
+; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test32:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
+; SKX-NEXT: ## -- End function
%y = load <4 x double>, <4 x double>* %yp, align 4
%mask = fcmp ogt <4 x double> %y, %x
@@ -976,6 +1011,7 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp
; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
+; CHECK-NEXT:
## -- End function %y = load <8 x double>, <8 x double>* %yp, align 4 %mask = fcmp olt <8 x double> %x, %y %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1 @@ -988,12 +1024,14 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no ; KNL-NEXT: vcmpltps (%rdi), %xmm0, %xmm2 ; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; KNL-NEXT: retq +; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test34: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1 ; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq +; SKX-NEXT: ## -- End function %y = load <4 x float>, <4 x float>* %yp, align 4 %mask = fcmp olt <4 x float> %x, %y %max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1 @@ -1010,12 +1048,14 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no ; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> ; KNL-NEXT: retq +; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test35: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1 ; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq +; SKX-NEXT: ## -- End function %y = load <8 x float>, <8 x float>* %yp, align 4 %mask = fcmp ogt <8 x float> %y, %x @@ -1029,6 +1069,7 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp ; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1 ; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq +; CHECK-NEXT: ## -- End function %y = load <16 x float>, <16 x float>* %yp, align 4 %mask = fcmp olt <16 x float> %x, %y %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1 @@ -1041,6 +1082,7 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou ; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 ; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq +; CHECK-NEXT: ## -- End function %a = load double, double* %ptr %v = insertelement <8 x double> undef, double %a, i32 0 @@ -1058,12 +1100,14 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nou ; KNL-NEXT: vcmpltpd %ymm2, %ymm0, %ymm2 ; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; KNL-NEXT: retq +; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test38: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1 ; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq +; SKX-NEXT: ## -- End function %a = load double, double* %ptr %v = insertelement <4 x double> undef, double %a, i32 0 @@ -1081,12 +1125,14 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou ; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2 ; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; KNL-NEXT: retq +; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test39: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1 ; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq +; SKX-NEXT: ## -- End function %a = load double, double* %ptr %v = insertelement <2 x double> undef, double %a, i32 0 @@ -1104,6 +1150,7 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n ; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1 ; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; CHECK-NEXT: retq +; CHECK-NEXT: ## -- End function %a = load float, float* %ptr %v = insertelement <16 x float> undef, float %a, i32 0 @@ -1124,12 +1171,14 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) noun ; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} ; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> 
%ZMM0<kill> ; KNL-NEXT: retq +; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test41: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1 ; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} ; SKX-NEXT: retq +; SKX-NEXT: ## -- End function %a = load float, float* %ptr %v = insertelement <8 x float> undef, float %a, i32 0 @@ -1147,12 +1196,14 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) noun ; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2 ; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; KNL-NEXT: retq +; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test42: ; SKX: ## BB#0: ; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1 ; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq +; SKX-NEXT: ## -- End function %a = load float, float* %ptr %v = insertelement <4 x float> undef, float %a, i32 0 @@ -1172,6 +1223,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x ; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} ; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; KNL-NEXT: retq +; KNL-NEXT: ## -- End function ; ; SKX-LABEL: test43: ; SKX: ## BB#0: @@ -1180,6 +1232,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x ; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} ; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ; SKX-NEXT: retq +; SKX-NEXT: ## -- End function %a = load double, double* %ptr %v = insertelement <8 x double> undef, double %a, i32 0 diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll index e0acf2be653e2..43b1f53a09fae 100644 --- a/test/CodeGen/X86/avx512vl-vec-cmp.ll +++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll @@ -1,56 +1,98 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind { -; CHECK-LABEL: test256_1: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_1: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_1: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 +; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask = icmp eq <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y ret <4 x i64> %max } define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind { -; CHECK-LABEL: test256_2: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_2: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 +; VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_2: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask = icmp sgt <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y ret <4 x i64> %max } define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind { -; CHECK-LABEL: test256_3: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %ymm0, 
%ymm1, %k1 -; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_3: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k1 +; VLX-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_3: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def> +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1 +; NoVLX-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %mask = icmp sge <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y ret <8 x i32> %max } define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind { -; CHECK-LABEL: test256_4: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_4: +; VLX: # BB#0: +; VLX-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1 +; VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_4: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm4 +; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0 +; NoVLX-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0 +; NoVLX-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask = icmp ugt <4 x i64> %x, %y %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y ret <4 x i64> %max } define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_5: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_5: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_5: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp eq <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -58,11 +100,21 @@ define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwin } define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_5b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_5b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_5b: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpeqd %zmm0, %zmm2, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp eq <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -70,11 +122,21 @@ define <8 x i32> 
@test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi } define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_6: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_6: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_6: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sgt <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -82,11 +144,21 @@ define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun } define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_6b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_6b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_6b: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp slt <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -94,11 +166,21 @@ define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou } define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_7: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_7: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_7: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sle <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -106,11 +188,21 @@ define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun } define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_7b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_7b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_7b: +; NoVLX: # BB#0: +; 
NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp sge <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -118,11 +210,21 @@ define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou } define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_8: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_8: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_8: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp ule <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -130,11 +232,21 @@ define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun } define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test256_8b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_8b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_8b: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask = icmp uge <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -142,12 +254,25 @@ define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou } define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind { -; CHECK-LABEL: test256_9: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_9: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; VLX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1} +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_9: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def> +; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def> +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpeqd %zmm3, %zmm2, %k0 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; 
NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %mask1 = icmp eq <8 x i32> %x1, %y1 %mask0 = icmp eq <8 x i32> %x, %y %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer @@ -156,12 +281,22 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> } define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind { -; CHECK-LABEL: test256_10: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1 -; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_10: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleq %ymm1, %ymm0, %k1 +; VLX-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1} +; VLX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_10: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm3 +; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 +; NoVLX-NEXT: vpandn %ymm3, %ymm1, %ymm1 +; NoVLX-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <4 x i64> %x1, %y1 %mask0 = icmp sle <4 x i64> %x, %y %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer @@ -170,12 +305,20 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64 } define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { -; CHECK-LABEL: test256_11: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_11: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtq %ymm2, %ymm1, %k1 +; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1} +; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_11: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm3 +; NoVLX-NEXT: vpand %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask1 = icmp sgt <4 x i64> %x1, %y1 %y = load <4 x i64>, <4 x i64>* %y.ptr, align 4 %mask0 = icmp sgt <4 x i64> %x, %y @@ -185,12 +328,25 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 } define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { -; CHECK-LABEL: test256_12: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1 -; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_12: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1 +; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1} +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_12: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def> +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0 +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %mask1 = icmp sge <8 x i32> %x1, %y1 %y = load <8 x i32>, <8 x i32>* %y.ptr, align 4 %mask0 = icmp ule <8 x i32> %x, %y @@ -200,11 +356,18 @@ 
define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 } define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind { -; CHECK-LABEL: test256_13: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_13: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1 +; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_13: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2 +; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer @@ -214,11 +377,21 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind } define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind { -; CHECK-LABEL: test256_14: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_14: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_14: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer @@ -228,12 +401,25 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind } define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { -; CHECK-LABEL: test256_15: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1 -; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_15: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1 +; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1} +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_15: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def> +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0 +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %mask1 = icmp sge <8 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 @@ -245,12 +431,21 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32 } define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { -; CHECK-LABEL: test256_16: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1 -; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq 
%ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_16: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleq %ymm1, %ymm2, %k1 +; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1} +; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_16: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm3 +; NoVLX-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm3 +; NoVLX-NEXT: vpandn %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <4 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 @@ -262,11 +457,21 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64 } define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_17: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_17: +; VLX: # BB#0: +; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_17: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpneqd %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp ne <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -274,11 +479,21 @@ define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi } define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_18: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_18: +; VLX: # BB#0: +; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_18: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpneqd %zmm0, %zmm2, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp ne <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -286,11 +501,21 @@ define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi } define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_19: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_19: +; VLX: # BB#0: +; VLX-NEXT: vpcmpnltud (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_19: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpnltud %zmm2, %zmm0, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; 
NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp uge <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -298,11 +523,21 @@ define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi } define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { -; CHECK-LABEL: test256_20: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test256_20: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 +; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test256_20: +; NoVLX: # BB#0: +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqu (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1 +; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; NoVLX-NEXT: retq %y = load <8 x i32>, <8 x i32>* %yp, align 4 %mask = icmp uge <8 x i32> %y, %x %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 @@ -310,55 +545,90 @@ define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi } define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind { -; CHECK-LABEL: test128_1: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_1: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 +; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_1: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 +; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask = icmp eq <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y ret <2 x i64> %max } define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind { -; CHECK-LABEL: test128_2: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_2: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_2: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask = icmp sgt <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y ret <2 x i64> %max } define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind { -; CHECK-LABEL: test128_3: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_3: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k1 +; VLX-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_3: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; NoVLX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask = icmp sge <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y ret <4 x i32> %max } define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind { -; CHECK-LABEL: test128_4: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 
{%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_4: +; VLX: # BB#0: +; VLX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 +; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_4: +; NoVLX: # BB#0: +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm3, %xmm1, %xmm4 +; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; NoVLX-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0 +; NoVLX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask = icmp ugt <2 x i64> %x, %y %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y ret <2 x i64> %max } define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind { -; CHECK-LABEL: test128_5: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_5: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_5: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %yp, align 4 %mask = icmp eq <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -366,11 +636,17 @@ define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwin } define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind { -; CHECK-LABEL: test128_5b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_5b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_5b: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %yp, align 4 %mask = icmp eq <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -378,11 +654,17 @@ define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwi } define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_6: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_6: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_6: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sgt <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -390,11 +672,17 @@ define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun } define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_6b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_6b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_6b: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, 
%xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp slt <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -402,11 +690,19 @@ define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_7: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_7: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_7: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sle <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -414,11 +710,19 @@ define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun } define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_7b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_7b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_7b: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp sge <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -426,11 +730,18 @@ define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_8: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_8: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_8: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ule <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -438,11 +749,19 @@ define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun } define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_8b: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_8b: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_8b: +; NoVLX: # BB#0: +; NoVLX-NEXT: vmovdqu (%rdi), %xmm2 +; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; 
NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -450,12 +769,20 @@ define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind { -; CHECK-LABEL: test128_9: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_9: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; VLX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1} +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_9: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm3 +; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp eq <4 x i32> %x1, %y1 %mask0 = icmp eq <4 x i32> %x, %y %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer @@ -464,12 +791,22 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> } define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind { -; CHECK-LABEL: test128_10: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1 -; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_10: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleq %xmm1, %xmm0, %k1 +; VLX-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1} +; VLX-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_10: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; NoVLX-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; NoVLX-NEXT: vpandn %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <2 x i64> %x1, %y1 %mask0 = icmp sle <2 x i64> %x, %y %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer @@ -478,12 +815,20 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64 } define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { -; CHECK-LABEL: test128_11: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1 -; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_11: +; VLX: # BB#0: +; VLX-NEXT: vpcmpgtq %xmm2, %xmm1, %k1 +; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1} +; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_11: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm3 +; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2 +; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sgt <2 x i64> %x1, %y1 %y = load <2 x i64>, <2 x i64>* %y.ptr, align 4 %mask0 = icmp sgt <2 x i64> %x, %y @@ -493,12 +838,21 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 } define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { -; CHECK-LABEL: test128_12: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1 -; CHECK-NEXT: vpcmpleud (%rdi), 
%xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_12: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1 +; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1} +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_12: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 +; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm3 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 +; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <4 x i32> %x1, %y1 %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask0 = icmp ule <4 x i32> %x, %y @@ -508,11 +862,18 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 } define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind { -; CHECK-LABEL: test128_13: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1 -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_13: +; VLX: # BB#0: +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1 +; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_13: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm2 +; NoVLX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm2 +; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1 @@ -522,11 +883,20 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind } define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind { -; CHECK-LABEL: test128_14: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_14: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_14: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm2 +; NoVLX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer @@ -536,12 +906,21 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind } define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { -; CHECK-LABEL: test128_15: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1 -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_15: +; VLX: # BB#0: +; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1 +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1} +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_15: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2 +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm3 +; NoVLX-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm3 +; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <4 x i32> %x1, %y1 %yb = load i32, i32* %yb.ptr, align 4 %y.0 = insertelement <4 x i32> 
undef, i32 %yb, i32 0 @@ -553,12 +932,21 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32 } define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { -; CHECK-LABEL: test128_16: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1 -; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1} -; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_16: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleq %xmm1, %xmm2, %k1 +; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1} +; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_16: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm3 +; NoVLX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3 +; NoVLX-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %mask1 = icmp sge <2 x i64> %x1, %y1 %yb = load i64, i64* %yb.ptr, align 4 %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 @@ -570,11 +958,19 @@ define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64 } define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_17: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_17: +; VLX: # BB#0: +; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_17: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ne <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -582,11 +978,19 @@ define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_18: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_18: +; VLX: # BB#0: +; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_18: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp ne <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -594,11 +998,18 @@ define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_19: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_19: +; VLX: # BB#0: +; VLX-NEXT: vpcmpnltud (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_19: +; NoVLX: # BB#0: +; NoVLX-NEXT: vpmaxud (%rdi), %xmm0, %xmm2 +; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; 
NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %x, %y %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 @@ -606,11 +1017,19 @@ define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou } define <4 x i32> @test128_20(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { -; CHECK-LABEL: test128_20: -; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: retq +; VLX-LABEL: test128_20: +; VLX: # BB#0: +; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 +; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; VLX-NEXT: retq +; +; NoVLX-LABEL: test128_20: +; NoVLX: # BB#0: +; NoVLX-NEXT: vmovdqu (%rdi), %xmm2 +; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3 +; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: retq %y = load <4 x i32>, <4 x i32>* %y.ptr, align 4 %mask = icmp uge <4 x i32> %y, %x %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll index f297fc3db95fa..4d3a1495617ea 100644 --- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -1,13 +1,124 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=NoVLX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi0: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi2: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi3: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi4: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi5: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi6: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi7: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: 
kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -18,11 +129,122 @@ entry: } define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi8: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi9: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi10: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; 
NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi11: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi12: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi13: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi14: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi15: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> 
%load = load <2 x i64>, <2 x i64>* %__b @@ -34,12 +256,124 @@ entry: } define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi16: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi17: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi18: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi19: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi20: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi21: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi22: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi23: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -52,12 +386,124 @@ entry: } define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi24: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi25: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi26: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi27: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi28: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi29: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi30: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi31: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -72,11 +518,127 @@ entry: define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi32: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi33: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi34: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi35: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi36: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi37: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi38: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi39: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd 
%xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -87,11 +649,127 @@ entry: } define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb (%rdi), 
%xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi40: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi41: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi42: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi43: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi44: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi45: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi46: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi47: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -103,12 +781,129 @@ entry: } define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi48: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi49: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi50: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi51: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi52: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi53: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi54: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi55: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; 
NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -121,12 +916,129 @@ entry: } define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi56: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi57: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi58: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi59: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi60: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi61: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi62: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi63: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; 
NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -141,12 +1053,46 @@ entry: define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_vpcmpeqb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi64: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi65: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi66: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -157,12 +1103,46 @@ entry: } define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqb (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqb (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi67: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi68: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi69: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -174,13 +1154,56 @@ entry: } define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> 
%__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi70: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi71: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi72: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -193,13 +1216,56 @@ entry: } define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi73: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi74: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi75: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord 
%zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -214,11 +1280,24 @@ entry: define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -229,11 +1308,24 @@ entry: } define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -245,12 +1337,26 @@ entry: } define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; 
VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -263,12 +1369,26 @@ entry: } define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -283,11 +1403,72 @@ entry: define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi76: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi77: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi78: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
+; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -298,11 +1479,72 @@ entry: } define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi79: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi80: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi81: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -314,12 +1556,74 @@ entry: } define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi82: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi83: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi84: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -332,12 +1636,74 @@ entry: } define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: 
test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi85: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi86: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi87: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -352,11 +1718,77 @@ entry: define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi88: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi89: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi90: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: 
vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -367,11 +1799,77 @@ entry: } define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi91: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi92: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi93: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
+; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -383,12 +1881,79 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi94: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi95: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi96: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -401,12 +1966,79 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi97: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi98: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi99: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, 
%zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -421,12 +2053,123 @@ entry: define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi100: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi101: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi102: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi103: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi104: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi105: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi106: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi107: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; 
NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -437,12 +2180,123 @@ entry: } define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi108: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi109: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi110: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi111: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi112: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi113: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi114: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi115: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -454,13 +2308,125 @@ entry: } define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi116: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi117: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi118: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: 
.Lcfi119: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi120: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi121: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi122: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi123: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -473,13 +2439,125 @@ entry: } define 
zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi124: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi125: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi126: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi127: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi128: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi129: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi130: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi131: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -494,12 +2572,128 @@ entry: define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi132: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi133: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi134: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi135: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi136: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi137: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi138: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi139: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -510,12 +2704,128 @@ entry: } define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi140: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi141: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi142: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi143: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi144: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi145: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi146: +; NoVLX-NEXT: 
.cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi147: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -527,13 +2837,130 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 
zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi148: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi149: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi150: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi151: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi152: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi153: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi154: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi155: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -546,13 +2973,130 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi156: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi157: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi158: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi159: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi160: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi161: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi162: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi163: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; 
NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -567,12 +3111,348 @@ entry: define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi164: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi165: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi166: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: 
movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl 
$16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw 
$5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -583,12 +3463,263 @@ entry: } define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqw (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi167: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi168: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi169: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 
+; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; 
NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -600,13 +3731,358 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi170: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi171: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi172: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; 
NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; 
NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm6, %xmm6 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -619,13 +4095,273 @@ entry: } define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi173: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi174: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi175: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; 
NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: 
movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -640,11 +4376,51 @@ entry: define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -655,11 +4431,51 @@ entry: } define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -671,12 +4487,70 @@ entry: } define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq 
%zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -690,12 +4564,70 @@ entry: } define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: 
retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -711,11 +4643,52 @@ entry: define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -728,12 +4701,71 @@ entry: } define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 
+; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -750,11 +4782,50 @@ entry: define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: 
vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -765,11 +4836,50 @@ entry: } define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -781,12 +4891,69 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> 
%EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -800,12 +4967,69 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: 
kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -821,11 +5045,51 @@ entry: define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; 
NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -838,12 +5102,70 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 
{{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -860,11 +5182,39 @@ entry: define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi176: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi177: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi178: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -875,11 +5225,39 @@ entry: } define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi179: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi180: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi181: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: 
popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -891,12 +5269,58 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi182: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi183: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi184: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -910,12 +5334,58 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi185: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi186: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi187: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: 
vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -931,11 +5401,40 @@ entry: define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi188: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi189: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi190: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -948,12 +5447,59 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; 
VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi191: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi192: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi193: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -970,11 +5516,46 @@ entry: define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi194: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi195: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi196: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -985,11 +5566,46 @@ entry: } define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi197: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi198: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi199: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -1001,12 +5617,65 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi200: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi201: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi202: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, 
%k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -1020,12 +5689,65 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi203: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi204: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi205: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -1041,11 +5763,47 @@ entry: define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi206: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi207: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi208: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -1058,12 +5816,66 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi209: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi210: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi211: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -1080,21 +5892,23 @@ entry: define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1106,21 +5920,23 @@ entry: } define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: 
test_vpcmpeqd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1133,23 +5949,25 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1163,23 +5981,25 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1195,21 +6015,23 @@ entry: define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd 
(%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1223,23 +6045,25 @@ entry: } define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -1256,12 +6080,72 @@ entry: define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi212: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi213: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi214: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; 
NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -1272,12 +6156,72 @@ entry: } define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi215: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi216: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi217: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -1289,13 +6233,75 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi218: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi219: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi220: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x 
i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -1308,13 +6314,75 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi221: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi222: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi223: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -1329,12 +6397,72 @@ entry: define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: 
vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi224: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi225: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi226: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -1347,13 +6475,75 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi227: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi228: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi229: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; 
NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -1369,12 +6559,77 @@ entry: define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi230: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi231: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi232: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, 
%k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -1385,12 +6640,77 @@ entry: } define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi233: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi234: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi235: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -1402,13 +6722,80 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi236: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi237: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi238: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax 
+; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -1421,13 +6808,80 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi239: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi240: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi241: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -1442,12 +6896,77 @@ entry: define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: -; 
CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi242: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi243: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi244: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -1460,13 +6979,80 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: 
pushq %rbp +; NoVLX-NEXT: .Lcfi245: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi246: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi247: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -1482,12 +7068,120 @@ entry: define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi248: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi249: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi250: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi251: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi252: +; NoVLX-NEXT: 
.cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi253: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi254: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi255: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -1498,12 +7192,120 @@ entry: } define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 -; CHECK-NEXT: 
kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi256: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi257: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi258: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi259: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi260: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi261: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi262: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi263: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -1515,13 +7317,122 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi264: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi265: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi266: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi267: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi268: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi269: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi270: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi271: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -1534,13 +7445,122 @@ entry: } define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi272: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi273: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi274: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi275: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi276: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi277: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi278: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi279: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -1555,12 +7575,120 @@ entry: define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi280: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi281: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi282: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: 
pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi283: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi284: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi285: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi286: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi287: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -1573,13 +7701,122 @@ entry: } define 
zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi288: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi289: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi290: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi291: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi292: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi293: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi294: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi295: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -1595,12 +7832,125 @@ entry: define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi296: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi297: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi298: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi299: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi300: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi301: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi302: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi303: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi 
+; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -1611,12 +7961,125 @@ entry: } define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi304: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi305: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi306: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi307: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi308: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi309: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi310: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi311: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -1628,13 +8091,127 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: +; 
VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi312: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi313: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi314: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi315: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi316: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi317: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi318: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi319: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb 
$13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -1647,13 +8224,127 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi320: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi321: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi322: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi323: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi324: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi325: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi326: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi327: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -1668,12 +8359,125 @@ entry: define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi328: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi329: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi330: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi331: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi332: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi333: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi334: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi335: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; 
NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -1686,13 +8490,127 @@ entry: } define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; 
VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi336: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi337: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi338: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi339: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi340: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi341: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi342: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi343: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -1708,12 +8626,23 @@ entry: define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1724,12 +8653,23 @@ entry: } define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1741,13 +8681,34 @@ entry: } define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_masked_vpcmpeqq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1761,13 +8722,34 @@ entry: } define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1783,12 +8765,24 @@ entry: define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -1801,13 
+8795,35 @@ entry: } define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -1824,11 +8840,35 @@ entry: define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1839,11 +8879,35 @@ entry: } define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # 
kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1855,12 +8919,46 @@ entry: } define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1874,12 +8972,46 @@ entry: } define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; 
VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1895,11 +9027,36 @@ entry: define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -1912,12 +9069,47 @@ entry: } define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; 
VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -1934,11 +9126,34 @@ entry: define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1949,11 +9164,34 @@ entry: } define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # 
%entry +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -1965,12 +9203,45 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -1984,12 +9255,45 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem: +; NoVLX: # 
BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -2005,11 +9309,35 @@ entry: define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2022,12 +9350,46 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: 
vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2044,11 +9406,39 @@ entry: define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi344: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi345: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi346: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -2059,11 +9449,39 @@ entry: } define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi347: +; NoVLX-NEXT: .cfi_def_cfa_offset 
16 +; NoVLX-NEXT: .Lcfi348: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi349: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -2075,12 +9493,50 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi350: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi351: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi352: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -2094,12 +9550,50 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; 
VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi353: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi354: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi355: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -2115,11 +9609,40 @@ entry: define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi356: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi357: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi358: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2132,12 +9655,51 @@ entry: } define zeroext i32 
@test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi359: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi360: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi361: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -2154,11 +9716,46 @@ entry: define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi362: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi363: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi364: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd 
%zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__a to <2 x i64>
 %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2169,11 +9766,46 @@ entry:
 }

 define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi365:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi366:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi367:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__a to <2 x i64>
 %load = load <2 x i64>, <2 x i64>* %__b
@@ -2185,12 +9817,57 @@ entry:
 }

 define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi368:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi369:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi370:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__a to <2 x i64>
 %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2204,12 +9881,57 @@ entry:
 }

 define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi371:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi372:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi373:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__a to <2 x i64>
 %load = load <2 x i64>, <2 x i64>* %__b
@@ -2225,11 +9947,47 @@ entry:

 define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi374:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi375:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi376:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__a to <2 x i64>
 %load = load i64, i64* %__b
@@ -2242,12 +10000,58 @@ entry:
 }

 define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi377:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi378:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi379:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__a to <2 x i64>
 %load = load i64, i64* %__b
@@ -2264,12 +10068,53 @@ entry:

 define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2280,12 +10125,53 @@ entry:
 }

 define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -2297,13 +10183,72 @@ entry:
 }

 define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2317,13 +10262,72 @@ entry:
 }

 define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -2339,12 +10343,54 @@ entry:

 define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -2357,13 +10403,73 @@ entry:
 }

 define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -2380,12 +10486,52 @@ entry:

 define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2396,12 +10542,52 @@ entry:
 }

 define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -2413,13 +10599,71 @@ entry:
 }

 define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2433,13 +10677,71 @@ entry:
 }

 define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -2455,12 +10757,53 @@ entry:

 define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -2473,13 +10816,72 @@ entry:
 }

 define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -2496,12 +10898,41 @@ entry:

 define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi380:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi381:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi382:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2512,12 +10943,41 @@ entry:
 }

 define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi383:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi384:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi385:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -2529,13 +10989,60 @@ entry:
 }

 define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi386:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi387:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi388:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2549,13 +11056,60 @@ entry:
 }

 define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi389:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi390:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi391:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -2571,12 +11125,42 @@ entry:

 define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi392:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi393:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi394:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -2589,13 +11173,61 @@ entry:
 }

 define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi395:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi396:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi397:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -2612,12 +11244,48 @@ entry:

 define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi398:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi399:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi400:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2628,12 +11296,48 @@ entry:
 }

 define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi401:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi402:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi403:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -2645,13 +11349,67 @@ entry:
 }

 define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi404:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi405:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi406:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2665,13 +11423,67 @@ entry:
 }

 define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi407:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi408:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi409:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load <4 x i64>, <4 x i64>* %__b
@@ -2687,12 +11499,49 @@ entry:

 define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi410:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi411:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi412:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -2705,13 +11554,68 @@ entry:
 }

 define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi413:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi414:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi415:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__a to <4 x i64>
 %load = load i64, i64* %__b
@@ -2728,12 +11632,20 @@ entry:

 define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2744,12 +11656,20 @@ entry:
 }

 define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__a to <8 x i64>
 %load = load <8 x i64>, <8 x i64>* %__b
@@ -2761,13 +11681,22 @@ entry:
 }

 define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-;
CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -2780,13 +11709,22 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -2801,12 +11739,20 @@ entry: define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -2819,13 +11765,22 @@ entry: } define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw 
%k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -2841,12 +11796,70 @@ entry: define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi416: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi417: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi418: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -2857,12 +11870,70 @@ entry: } define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi419: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: 
.Lcfi420: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi421: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -2874,13 +11945,72 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi422: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi423: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi424: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -2893,13 +12023,72 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi425: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi426: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi427: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: 
kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -2914,12 +12103,70 @@ entry: define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi428: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi429: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi430: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -2932,13 +12179,72 @@ entry: } define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: 
kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi431: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi432: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi433: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -2954,12 +12260,75 @@ entry: define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi434: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi435: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi436: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: 
vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -2970,12 +12339,75 @@ entry: } define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi437: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi438: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi439: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -2987,13 +12419,77 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi440: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi441: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi442: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -3006,13 +12502,77 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi443: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi444: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi445: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; 
NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -3027,12 +12587,75 @@ entry: define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi446: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi447: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi448: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -3045,13 +12668,77 @@ entry: } define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: 
test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi449: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi450: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi451: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -3067,11 +12754,122 @@ entry: define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi452: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi453: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi454: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; 
NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi455: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi456: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi457: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi458: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi459: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ 
-3082,11 +12880,122 @@ entry: } define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi460: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi461: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi462: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi463: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi464: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi465: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi466: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi467: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, 
%r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -3098,12 +13007,124 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi468: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi469: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi470: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi471: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi472: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi473: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi474: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi475: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; 
NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -3116,12 +13137,124 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi476: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi477: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi478: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi479: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi480: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi481: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi482: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi483: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -3136,11 +13269,127 @@ entry: define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 +; 
VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi484: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi485: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi486: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi487: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi488: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi489: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi490: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi491: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -3151,11 +13400,127 @@ entry: } define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi492: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi493: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi494: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi495: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi496: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi497: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi498: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi499: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -3167,12 +13532,129 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi500: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi501: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi502: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi503: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi504: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi505: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi506: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi507: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw 
%k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -3185,12 +13667,129 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: +; 
VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi508: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi509: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi510: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi511: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi512: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi513: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi514: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi515: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -3205,12 +13804,46 @@ entry: define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi516: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi517: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi518: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -3221,12 +13854,46 @@ entry: } define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtb (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi519: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi520: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi521: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; 
NoVLX-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -3238,13 +13905,56 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi522: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi523: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi524: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -3257,13 +13967,56 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; 
CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi525: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi526: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi527: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vpcmpgtb (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -3278,11 +14031,24 @@ entry: define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3293,11 +14059,24 @@ entry: } define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_vpcmpsgtw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3309,12 +14088,26 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3327,12 +14120,26 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3347,11 +14154,72 @@ entry: define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi528: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi529: +; NoVLX-NEXT: .cfi_offset 
%rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi530: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3362,11 +14230,72 @@ entry: } define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi531: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi532: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi533: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3378,12 +14307,74 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi534: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi535: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi536: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, 
%esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3396,12 +14387,74 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi537: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi538: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi539: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> 
%load = load <2 x i64>, <2 x i64>* %__b @@ -3416,11 +14469,77 @@ entry: define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi540: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi541: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi542: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3431,11 +14550,77 @@ entry: } define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; 
NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi543: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi544: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi545: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3447,12 +14632,79 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi546: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi547: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi548: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq 
%zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -3465,12 +14717,79 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi549: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi550: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi551: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -3485,12 +14804,123 @@ entry: define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi552: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi553: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi554: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi555: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi556: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi557: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi558: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi559: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -3501,12 +14931,123 @@ entry: } define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi560: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi561: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi562: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq 
$32, %rsp +; NoVLX-NEXT: .Lcfi563: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi564: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi565: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi566: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi567: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -3518,13 +15059,125 @@ entry: } define zeroext i32 
@test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi568: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi569: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi570: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi571: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi572: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi573: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi574: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi575: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -3537,13 +15190,125 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi576: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi577: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi578: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi579: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi580: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi581: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi582: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi583: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; 
NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -3558,12 +15323,128 @@ entry: define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi584: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi585: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi586: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi587: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi588: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi589: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi590: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi591: +; 
NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -3574,12 +15455,128 @@ entry: } define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; 
CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi592: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi593: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi594: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi595: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi596: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi597: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi598: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi599: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -3591,13 +15588,130 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi600: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi601: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi602: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi603: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi604: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi605: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi606: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi607: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -3610,13 +15724,130 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi608: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi609: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi610: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; 
NoVLX-NEXT: .Lcfi611: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi612: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi613: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi614: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi615: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 
+; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -3631,12 +15862,348 @@ entry: define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi616: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi617: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi618: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, 
%xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, 
%ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -3647,12 +16214,263 @@ entry: } define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtw (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi619: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi620: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi621: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; 
NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -3664,13 +16482,358 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi622: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi623: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi624: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 
+; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; 
NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm6, %xmm6 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm8, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: 
vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw 
%k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -3683,13 +16846,273 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi625: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi626: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi627: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; 
NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, 
%xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -3704,11 +17127,51 @@ entry: define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast 
<2 x i64> %__b to <4 x i32> @@ -3719,11 +17182,51 @@ entry: } define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3735,12 +17238,70 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, 
%eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3754,12 +17315,70 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3775,11 +17394,52 @@ entry: define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = 
bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -3792,12 +17452,71 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -3814,11 +17533,50 @@ entry: define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; 
VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3829,11 +17587,50 @@ entry: } define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, 
%zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3845,12 +17642,69 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, 
%eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3864,12 +17718,69 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3885,11 +17796,51 @@ entry: define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, 
%eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -3902,12 +17853,70 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; 
NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -3924,11 +17933,39 @@ entry: define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi628: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi629: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi630: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3939,11 +17976,39 @@ entry: } define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 -; 
CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi631: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi632: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi633: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3955,12 +18020,58 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi634: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi635: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi636: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, 
%zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -3974,12 +18085,58 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi637: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi638: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi639: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -3995,11 +18152,40 @@ entry: define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi640: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi641: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi642: +; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -4012,12 +18198,59 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi643: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi644: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi645: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -4034,11 +18267,46 @@ entry: define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; 
CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi646: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi647: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi648: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -4049,11 +18317,46 @@ entry: } define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi649: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi650: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi651: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq 
%rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -4065,12 +18368,65 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi652: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi653: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi654: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -4084,12 +18440,65 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem: +; NoVLX: # 
BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi655: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi656: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi657: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -4105,11 +18514,47 @@ entry: define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi658: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi659: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi660: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd 
%zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -4122,12 +18567,66 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi661: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi662: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi663: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -4144,21 +18643,23 @@ entry: define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw 
%k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4170,21 +18671,23 @@ entry: } define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4197,23 +18700,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4227,23 +18732,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: 
## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4259,21 +18766,23 @@ entry: define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4287,23 +18796,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -4320,12 +18831,72 @@ entry: define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; 
CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi664: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi665: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi666: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -4336,12 +18907,72 @@ entry: } define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi667: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi668: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi669: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp 
+; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -4353,13 +18984,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi670: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi671: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi672: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -4372,13 +19065,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi673: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi674: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi675: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -4393,12 +19148,72 @@ entry: define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi676: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi677: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi678: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> 
%load = load i32, i32* %__b @@ -4411,13 +19226,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi679: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi680: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi681: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -4433,12 +19310,77 @@ entry: define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_vpcmpsgtd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi682: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi683: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi684: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -4449,12 +19391,77 @@ entry: } define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi685: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi686: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi687: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, 
%zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -4466,13 +19473,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi688: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi689: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi690: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw 
%k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -4485,13 +19559,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi691: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi692: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi693: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -4506,12 +19647,77 @@ entry: define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi694: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi695: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi696: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, 
%eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -4524,13 +19730,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi697: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi698: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi699: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; 
NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -4546,12 +19819,120 @@ entry: define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi700: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi701: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi702: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi703: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi704: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi705: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi706: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi707: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -4562,12 +19943,120 @@ entry: } define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi708: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi709: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi710: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi711: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi712: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi713: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi714: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi715: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -4579,13 +20068,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi716: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi717: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi718: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi719: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi720: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi721: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi722: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi723: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd 
%zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -4598,13 +20196,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, 
%eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi724: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi725: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi726: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi727: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi728: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi729: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi730: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi731: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -4619,12 +20326,120 @@ entry: define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi732: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi733: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi734: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi735: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi736: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi737: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi738: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi739: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -4637,13 +20452,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi740: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi741: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi742: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi743: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi744: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi745: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi746: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi747: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw 
%k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -4659,12 +20583,125 @@ entry: define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi748: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi749: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi750: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi751: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi752: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi753: +; NoVLX-NEXT: 
.cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi754: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi755: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -4675,12 +20712,125 @@ entry: } define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; 
CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi756: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi757: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi758: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi759: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi760: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi761: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi762: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi763: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -4692,13 +20842,127 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi764: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi765: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi766: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi767: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi768: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi769: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi770: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi771: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: 
kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -4711,13 +20975,127 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi772: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi773: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi774: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi775: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi776: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi777: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi778: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; 
NoVLX-NEXT: .Lcfi779: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -4732,12 +21110,125 @@ entry: define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: -; CHECK: ## 
BB#0: ## %entry -; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi780: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi781: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi782: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi783: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi784: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi785: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi786: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi787: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -4750,13 +21241,127 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi788: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi789: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi790: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi791: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi792: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi793: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi794: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi795: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: 
kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -4772,12 +21377,23 @@ entry: define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -4788,12 +21404,23 @@ entry: } define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; 
@@ -4772,12 +21377,23 @@ entry:
 }
 
 define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4788,12 +21404,23 @@ entry:
 }
 
 define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
@@ -4805,13 +21432,34 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4825,13 +21473,34 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
@@ -4847,12 +21516,24 @@ entry:
 
 define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
@@ -4865,13 +21546,35 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
@@ -4888,11 +21591,35 @@ entry:
 
 define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4903,11 +21630,35 @@ entry:
 }
 
 define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
@@ -4919,12 +21670,46 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4938,12 +21723,46 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
@@ -4959,11 +21778,36 @@ entry:
 
 define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
@@ -4976,12 +21820,47 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
@@ -4998,11 +21877,34 @@ entry:
 
 define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5013,11 +21915,34 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
@@ -5029,12 +21954,45 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5048,12 +22006,45 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
@@ -5069,11 +22060,35 @@ entry:
 
 define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
@@ -5086,12 +22101,46 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
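The v2i1-to-wider-mask tests that follow widen a two-lane compare result into 32- and 64-bit integers. A hedged C sketch of the pattern (illustrative names; _mm_cmpgt_epi64_mask is the AVX512VL form that NoVLX cannot use directly, which is why its code stays in xmm registers and round-trips through the stack):

#include <immintrin.h>
#include <stdint.h>

/* Two-lane signed 64-bit compare; only bits 0 and 1 of the mask are
   meaningful, and the result is zero-extended to the return width. */
uint32_t cmpgt_v2i1_to_i32(__m128i a, __m128i b) {
  __mmask8 m = _mm_cmpgt_epi64_mask(a, b); /* requires AVX512VL+AVX512F */
  return (uint32_t)(m & 0x3);
}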
@@ -5108,11 +22157,39 @@ entry:
 
 define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi796:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi797:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi798:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5123,11 +22200,39 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi799:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi800:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi801:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
@@ -5139,12 +22244,50 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi802:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi803:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi804:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5158,12 +22301,50 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi805:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi806:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi807:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
@@ -5179,11 +22360,40 @@ entry:
 
 define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi808:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi809:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi810:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
@@ -5196,12 +22406,51 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi811:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi812:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi813:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
@@ -5218,11 +22467,46 @@ entry:
 
 define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi814:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi815:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi816:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5233,11 +22517,46 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi817:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi818:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi819:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
@@ -5249,12 +22568,57 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi820:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi821:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi822:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5268,12 +22632,57 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi823:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi824:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi825:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load <2 x i64>, <2 x i64>* %__b
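For the v64i1 tests below, the NoVLX sequences spill two 16-bit mask words to an aligned stack slot and combine them into rax with shlq/orq, where the VLX code needs only a kmovq. A hedged C sketch of the zero-extension being tested (the broadcast {1to2} memory form; names are illustrative):

#include <immintrin.h>
#include <stdint.h>

/* Two-lane compare against a broadcast 64-bit scalar, zero-extended
   to 64 bits; requires AVX512VL+AVX512F for the intrinsic itself. */
uint64_t cmpgt_v2i1_b_to_i64(__m128i a, const long long *b) {
  __mmask8 m = _mm_cmpgt_epi64_mask(a, _mm_set1_epi64x(*b));
  return (uint64_t)(m & 0x3);
}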
@@ -5289,11 +22698,47 @@ entry:
 
 define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi826:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi827:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi828:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
@@ -5306,12 +22751,58 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi829:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi830:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi831:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <2 x i64> %__a to <2 x i64>
  %load = load i64, i64* %__b
@@ -5328,12 +22819,53 @@ entry:
 
 define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <4 x i64> %__a to <4 x i64>
  %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5344,12 +22876,53 @@ entry:
 }
 
 define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <4 x i64> %__a to <4 x i64>
  %load = load <4 x i64>, <4 x i64>* %__b
@@ -5361,13 +22934,72 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <4 x i64> %__a to <4 x i64>
  %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5381,13 +23013,72 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
  %0 = bitcast <4 x i64> %__a to <4 x i64>
  %load = load <4 x i64>, <4 x i64>* %__b
@@ -5403,12 +23094,54 @@ entry:
 
 define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5421,13 +23154,73 @@ entry: } define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: 
kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5444,12 +23237,52 @@ entry: define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, 
%zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -5460,12 +23293,52 @@ entry: } define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -5477,13 +23350,71 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -5497,13 +23428,71 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw 
%k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -5519,12 +23508,53 @@ entry: define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: 
vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5537,13 +23567,72 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 
{{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5560,12 +23649,41 @@ entry: define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi832: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi833: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi834: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -5576,12 +23694,41 @@ entry: } define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi835: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi836: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi837: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -5593,13 +23740,60 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi838: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi839: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi840: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -5613,13 +23807,60 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem: +; NoVLX: # 
BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi841: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi842: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi843: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -5635,12 +23876,42 @@ entry: define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi844: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi845: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi846: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq 
entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5653,13 +23924,61 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi847: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi848: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi849: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5676,12 +23995,48 @@ entry: define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi850: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi851: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi852: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: 
subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -5692,12 +24047,48 @@ entry: } define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi853: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi854: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi855: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -5709,13 +24100,67 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; 
VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi856: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi857: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi858: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -5729,13 +24174,67 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi859: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi860: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi861: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -5751,12 +24250,49 @@ entry: define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi862: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi863: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi864: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; 
NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5769,13 +24305,68 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi865: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi866: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi867: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -5792,12 +24383,20 @@ entry: define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> 
%EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -5808,12 +24407,20 @@ entry: } define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -5825,13 +24432,22 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -5844,13 +24460,22 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -5865,12 +24490,20 @@ entry: define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b: -; CHECK: 
## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -5883,13 +24516,22 @@ entry: } define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -5905,12 +24547,70 @@ entry: define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi868: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi869: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi870: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
+; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -5921,12 +24621,70 @@ entry: } define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi871: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi872: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi873: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to 
<8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -5938,13 +24696,72 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi874: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi875: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi876: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -5957,13 +24774,72 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi877: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi878: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi879: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -5978,12 +24854,70 @@ entry: define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi880: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi881: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi882: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, 
%k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -5996,13 +24930,72 @@ entry: } define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi883: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi884: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi885: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -6018,12 +25011,75 @@ entry: define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi886: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi887: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi888: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ 
-6034,12 +25090,75 @@ entry: } define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi889: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi890: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi891: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -6051,13 +25170,77 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_masked_vpcmpsgtq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi892: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi893: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi894: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -6070,13 +25253,77 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi895: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi896: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi897: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -6091,12 +25338,75 @@ entry: define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi898: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi899: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi900: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -6109,13 +25419,77 @@ entry: } define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi901: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi902: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi903: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -6131,11 +25505,124 @@ entry: define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi904: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi905: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi906: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi907: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi908: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi909: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi910: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi911: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, 
%k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -6146,11 +25633,125 @@ entry: } define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi912: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi913: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi914: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi915: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi916: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi917: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi918: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi919: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
+; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -6162,12 +25763,126 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi920: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi921: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi922: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi923: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi924: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi925: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi926: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi927: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -6180,12 +25895,127 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi928: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi929: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi930: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi931: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi932: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi933: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi934: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi935: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -6200,11 +26030,129 @@ entry: define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi936: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi937: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi938: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi939: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi940: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi941: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi942: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi943: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: 
kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -6215,11 +26163,130 @@ entry: } define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi944: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi945: 
+; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi946: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi947: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi948: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi949: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi950: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi951: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -6231,12 +26298,131 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi952: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi953: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi954: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi955: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi956: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi957: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi958: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi959: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -6249,12 +26435,132 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi960: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi961: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi962: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi963: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi964: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi965: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi966: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi967: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -6269,12 +26575,48 @@ entry: define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## 
%entry -; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi968: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi969: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi970: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -6285,12 +26627,49 @@ entry: } define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltb (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltb (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi971: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi972: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi973: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = 
load <4 x i64>, <4 x i64>* %__b @@ -6302,13 +26681,58 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi974: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi975: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi976: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -6321,13 +26745,59 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi977: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi978: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi979: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), 
%k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm4 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -6342,11 +26812,26 @@ entry: define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6357,11 +26842,27 @@ entry: } define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x 
i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6373,12 +26874,28 @@ entry: } define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6391,12 +26908,29 @@ entry: } define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6411,11 +26945,74 @@ entry: define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi980: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi981: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi982: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd 
%xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6426,11 +27023,75 @@ entry: } define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi983: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi984: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi985: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw 
$12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6442,12 +27103,76 @@ entry: } define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi986: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi987: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi988: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6460,12 +27185,77 @@ entry: } define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi989: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi990: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi991: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6480,11 +27270,79 @@ entry: define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi992: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi993: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi994: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6495,11 +27353,80 @@ entry: } define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, 
%rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi995: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi996: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi997: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6511,12 +27438,81 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi998: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; 
NoVLX-NEXT: .Lcfi999: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1000: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -6529,12 +27525,82 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1001: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1002: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1003: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -6549,12 +27615,125 @@ entry: define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1004: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1005: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1006: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1007: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1008: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1009: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1010: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1011: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd 
%ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -6565,12 +27744,126 @@ entry: } define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; 
VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1012: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1013: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1014: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1015: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1016: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1017: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1018: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1019: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -6582,13 +27875,127 @@ entry: } define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1020: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1021: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1022: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1023: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1024: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1025: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1026: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1027: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw 
$7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -6601,13 +28008,128 @@ entry: } define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1028: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1029: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1030: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1031: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1032: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1033: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1034: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1035: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: 
vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -6622,12 +28144,130 @@ entry: define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 -; 
CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1036: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1037: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1038: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1039: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1040: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1041: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1042: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1043: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -6638,12 +28278,131 @@ entry: } define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1044: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1045: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1046: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1047: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1048: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1049: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1050: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1051: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, 
%k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -6655,13 +28414,132 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1052: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1053: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1054: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1055: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: 
.Lcfi1056: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1057: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1058: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1059: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; 
NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -6674,13 +28552,133 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1060: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1061: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1062: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1063: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1064: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1065: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1066: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1067: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -6695,12 +28693,351 @@ entry: define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1068: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1069: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1070: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, 
%rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; 
NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, 
%k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -6711,12 +29048,268 @@ entry: } define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltw (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltw (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1071: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1072: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1073: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, 
%xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NoVLX-NEXT: vmovdqa (%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; 
NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -6728,13 +29321,361 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1074: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1075: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1076: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, 
%xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1 +; NoVLX-NEXT: 
vinserti128 $1, %xmm6, %ymm0, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm6, %xmm6 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm8, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 +; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, 
%xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -6747,13 +29688,278 @@ entry: } define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) 
local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1077: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1078: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1079: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, 
%xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; NoVLX-NEXT: vmovdqa (%rsi), %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm5, %ymm3 +; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4 +; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 +; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw 
$15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -6768,11 +29974,53 @@ entry: define zeroext i8 
@test_vpcmpsged_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -6783,11 +30031,54 @@ entry: } define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; 
NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -6799,12 +30090,70 @@ entry: } define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, 
%zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -6818,12 +30167,71 @@ entry: } define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, 
%eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -6839,12 +30247,55 @@ entry: define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -6857,13 +30308,72 @@ entry: } define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, 
%k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -6880,11 +30390,52 @@ entry: define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: 
vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -6895,11 +30446,53 @@ entry: } define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -6911,12 +30504,69 @@ entry: } define zeroext i16 
@test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -6930,12 +30580,70 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, 
%k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -6951,12 +30659,54 @@ entry: define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd 
$255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -6969,13 +30719,71 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; 
NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -6992,11 +30800,41 @@ entry: define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1080: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1081: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1082: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -7007,11 +30845,42 @@ entry: } define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1083: +; 
NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1084: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1085: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -7023,12 +30892,58 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1086: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1087: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1088: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 
x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -7042,12 +30957,59 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1089: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1090: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1091: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -7063,12 +31025,43 @@ entry: define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1092: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1093: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1094: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd 
(%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -7081,13 +31074,60 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1095: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1096: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1097: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -7104,11 +31144,48 @@ entry: define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> 
%__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1098: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1099: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1100: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -7119,11 +31196,49 @@ entry: } define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1101: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1102: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1103: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -7135,12 +31250,65 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1104: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1105: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1106: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -7154,12 +31322,66 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; 
VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1107: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1108: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1109: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -7175,12 +31397,50 @@ entry: define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %xmm1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1110: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1111: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1112: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -7193,13 +31453,67 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1113: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1114: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1115: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl 
{{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -7216,21 +31530,23 @@ entry: define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7242,21 +31558,23 @@ entry: } define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7269,23 +31587,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> 
%ZMM0<def> ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7299,23 +31619,25 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7331,22 +31653,24 @@ entry: define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7360,24 +31684,26 @@ entry: } define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: 
test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> @@ -7394,12 +31720,72 @@ entry: define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1116: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1117: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1118: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -7410,12 +31796,72 @@ entry: } define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem: 
-; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1119: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1120: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1121: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -7427,13 +31873,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1122: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1123: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1124: +; NoVLX-NEXT: 
.cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -7446,13 +31954,75 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1125: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1126: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1127: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: 
kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -7467,13 +32037,73 @@ entry: define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1128: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1129: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1130: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -7486,14 +32116,76 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1131: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1132: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1133: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: 
vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -7509,12 +32201,77 @@ entry: define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1134: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1135: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1136: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -7525,12 +32282,77 @@ entry: } define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: 
vpcmpnltd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1137: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1138: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1139: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -7542,13 +32364,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1140: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 
+; NoVLX-NEXT: .Lcfi1141: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1142: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %1 = bitcast <4 x i64> %__b to <8 x i32> @@ -7561,13 +32450,80 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1143: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1144: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1145: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, 
%k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k1, %k0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load <4 x i64>, <4 x i64>* %__b @@ -7582,13 +32538,78 @@ entry: define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %ymm1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1146: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1147: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1148: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -7601,14 +32622,81 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1149: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1150: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1151: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: kandw %k0, %k1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -7624,12 +32712,120 @@ entry: define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1152: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1153: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1154: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1155: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1156: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1157: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1158: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1159: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw 
$15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -7640,12 +32836,120 @@ entry: } define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1160: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1161: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1162: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1163: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1164: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1165: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1166: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1167: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -7657,13 +32961,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, 
%eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1168: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1169: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1170: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1171: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1172: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1173: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1174: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1175: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -7676,13 +33089,122 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1176: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1177: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1178: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1179: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1180: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1181: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1182: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1183: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: 
kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -7697,13 +33219,122 @@ entry: define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %zmm1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1184: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1185: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1186: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1187: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1188: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1189: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1190: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1191: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -7716,14 +33347,124 @@ entry: } define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1192: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1193: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1194: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; 
NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1195: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1196: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1197: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1198: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1199: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq 
entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -7739,12 +33480,125 @@ entry: define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1200: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1201: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1202: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1203: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1204: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1205: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1206: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1207: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -7755,12 +33609,125 @@ entry: } define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1208: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1209: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1210: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1211: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1212: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1213: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1214: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1215: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -7772,13 +33739,127 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1216: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1217: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1218: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1219: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1220: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1221: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; 
NoVLX-NEXT: .Lcfi1222: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1223: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -7791,13 +33872,127 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* 
%__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1224: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1225: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1226: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1227: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1228: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1229: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1230: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1231: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -7812,13 +34007,127 @@ entry: define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rdi), %zmm1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rdi), %zmm1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1232: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1233: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1234: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1235: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1236: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1237: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1238: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1239: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -7831,14 +34140,129 @@ entry: } define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastd (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastd (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1240: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1241: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1242: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1243: 
+; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1244: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1245: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1246: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1247: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x 
i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -7854,12 +34278,25 @@ entry: define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -7870,12 +34307,26 @@ entry: } define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -7887,13 +34338,34 @@ entry: } define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, 
%xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -7907,13 +34379,35 @@ entry: } define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -7929,13 +34423,27 @@ entry: define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -7948,14 +34456,36 @@ entry: } define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: 
## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -7972,11 +34502,37 @@ entry: define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -7987,11 +34543,38 @@ entry: } define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; 
+; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8003,12 +34586,46 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8022,12 +34639,47 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; 
VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8043,12 +34695,39 @@ entry: define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8061,13 +34740,48 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq 
(%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8084,11 +34798,36 @@ entry: define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8099,11 +34838,37 @@ entry: } define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_vpcmpsgeq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8115,12 +34880,45 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8134,12 +34932,46 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x 
i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8155,12 +34987,38 @@ entry: define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = 
load i64, i64* %__b @@ -8173,13 +35031,47 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8196,11 +35088,41 @@ entry: define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1248: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1249: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1250: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8211,11 +35133,42 @@ entry: } define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1251: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1252: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1253: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8227,12 +35180,50 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1254: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1255: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1256: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8246,12 +35237,51 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1257: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1258: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1259: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8267,12 +35297,43 @@ entry: define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # 
%entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1260: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1261: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1262: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8285,13 +35346,52 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1263: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1264: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1265: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8308,11 +35408,48 @@ entry: define zeroext 
i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1266: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1267: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1268: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8323,11 +35460,49 @@ entry: } define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1269: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1270: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1271: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; 
NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8339,12 +35514,57 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1272: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1273: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1274: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ -8358,12 +35578,58 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1275: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1276: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1277: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load <2 x i64>, <2 x i64>* %__b @@ -8379,12 +35645,50 @@ entry: define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %xmm1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1278: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1279: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1280: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8397,13 +35701,59 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %xmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1281: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1282: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1283: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %load = load i64, i64* %__b @@ -8420,12 +35770,55 @@ entry: define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; 
VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8436,12 +35829,56 @@ entry: } define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: 
vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8453,13 +35890,74 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; 
NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8473,13 +35971,75 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8495,13 +36055,57 @@ entry: define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8514,14 +36118,76 @@ entry: } define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8538,12 +36204,54 @@ entry: define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl 
$1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8554,12 +36262,55 @@ entry: } define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 
{%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8571,13 +36322,73 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; 
NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8591,13 +36402,74 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8613,13 +36485,56 @@ entry: define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; 
CHECK-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8632,14 +36547,75 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; 
NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8656,12 +36632,43 @@ entry: define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1284: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1285: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1286: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: 
vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8672,12 +36679,44 @@ entry: } define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1287: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1288: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1289: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8689,13 +36728,62 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1290: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1291: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1292: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: 
vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8709,13 +36797,63 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1293: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1294: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1295: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8731,13 +36869,45 @@ entry: define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1296: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1297: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1298: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8750,14 +36920,64 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1299: +; NoVLX-NEXT: 
.cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1300: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1301: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8774,12 +36994,50 @@ entry: define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1302: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1303: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1304: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8790,12 +37048,51 @@ entry: } define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1305: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1306: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1307: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8807,13 +37104,69 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1308: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1309: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1310: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq 
$-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -8827,13 +37180,70 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1311: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1312: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1313: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; 
NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -8849,13 +37259,52 @@ entry: define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %ymm1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1314: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1315: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1316: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8868,14 +37317,71 @@ entry: 
} define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %ymm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1317: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1318: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1319: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -8892,12 +37398,20 @@ entry: define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; 
+; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -8908,12 +37422,20 @@ entry: } define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -8925,13 +37447,22 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -8944,13 +37475,22 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -8965,13 +37505,22 @@ entry: define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq 
(%rdi), %zmm1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -8984,14 +37533,24 @@ entry: } define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -9007,12 +37566,70 @@ entry: define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1320: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1321: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1322: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: 
kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -9023,12 +37640,70 @@ entry: } define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1323: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1324: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1325: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; 
NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -9040,13 +37715,72 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1326: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1327: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1328: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -9059,13 +37793,72 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem: +; VLX: # 
BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1329: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1330: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1331: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -9080,13 +37873,72 @@ entry: define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1332: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1333: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1334: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, 
%zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -9099,14 +37951,74 @@ entry: } define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1335: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1336: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1337: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw 
%k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -9122,12 +38034,75 @@ entry: define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1338: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1339: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1340: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: 
vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -9138,12 +38113,75 @@ entry: } define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1341: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1342: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1343: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -9155,13 +38193,77 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; 
CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1344: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1345: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1346: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -9174,13 +38276,77 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; 
NoVLX-NEXT: .Lcfi1347: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1348: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1349: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -9195,13 +38361,77 @@ entry: define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rdi), %zmm1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1350: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1351: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1352: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) 
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -9214,14 +38444,79 @@ entry: } define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpbroadcastq (%rsi), %zmm1 -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1353: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1354: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1355: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw 
$12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -9237,11 +38532,125 @@ entry: define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1356: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1357: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1358: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1359: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1360: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1361: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1362: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1363: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -9252,11 +38661,125 @@ entry: } define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1364: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1365: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1366: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1367: +; NoVLX-NEXT: .cfi_offset 
%rbx, -56 +; NoVLX-NEXT: .Lcfi1368: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1369: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1370: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1371: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = 
bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -9268,12 +38791,127 @@ entry: } define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1372: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1373: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1374: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1375: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1376: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1377: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1378: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1379: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; 
NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -9286,12 +38924,127 @@ entry: } define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1380: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1381: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1382: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1383: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1384: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1385: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1386: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1387: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: 
kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -9306,11 +39059,130 @@ entry: define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1388: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1389: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1390: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: 
pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1391: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1392: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1393: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1394: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1395: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: 
movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -9321,11 +39193,130 @@ entry: } define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1396: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1397: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1398: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1399: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1400: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1401: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1402: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1403: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, 
%k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -9337,12 +39328,132 @@ entry: } define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1404: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1405: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1406: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1407: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1408: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1409: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1410: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1411: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, 
%k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %1 = bitcast <2 x i64> %__b to <16 x i8> @@ -9355,12 +39466,132 @@ entry: } define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} 
-; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1412: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1413: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1414: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1415: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1416: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1417: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1418: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1419: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <16 x i8> %load = load <2 x i64>, <2 x i64>* %__b @@ -9375,12 +39606,49 @@ entry: define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1420: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1421: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1422: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -9391,12 +39659,49 @@ entry: } define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltub (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: 
test_vpcmpultb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltub (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1423: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1424: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1425: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -9408,13 +39713,59 @@ entry: } define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1426: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1427: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1428: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm3, %xmm3 +; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm5, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm5, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %1 = bitcast <4 x i64> %__b to <32 x i8> @@ -9427,13 +39778,59 @@ entry: } define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1429: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1430: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1431: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm1, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm2, %xmm2 +; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0 +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <32 x i8> %load = load <4 x i64>, <4 x i64>* %__b @@ -9448,11 +39845,27 @@ entry: define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 -; 
CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9463,11 +39876,27 @@ entry: } define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9479,12 +39908,29 @@ entry: } define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9497,12 +39943,29 @@ entry: } define zeroext 
i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kunpckbw %k0, %k1, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9517,11 +39980,75 @@ entry: define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1432: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1433: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1434: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9532,11 +40059,75 @@ entry: } define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1435: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1436: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1437: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; 
NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9548,12 +40139,77 @@ entry: } define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1438: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1439: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1440: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9566,12 +40222,77 @@ entry: } define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovd 
%k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1441: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1442: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1443: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9586,11 +40307,80 @@ entry: define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1444: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1445: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1446: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; 
NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9601,11 +40391,80 @@ entry: } define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1447: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1448: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1449: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: 
vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9617,12 +40476,82 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1450: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1451: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1452: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %1 = bitcast <2 x i64> %__b to <8 x i16> @@ -9635,12 +40564,82 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1453: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1454: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1455: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 +; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: 
kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <8 x i16> %load = load <2 x i64>, <2 x i64>* %__b @@ -9655,12 +40654,126 @@ entry: define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1456: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1457: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1458: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1459: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1460: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1461: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1462: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1463: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -9671,12 +40784,126 @@ entry: } define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1464: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1465: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1466: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: 
pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1467: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1468: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1469: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1470: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1471: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp 
+; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -9688,13 +40915,128 @@ entry: } define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1472: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1473: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1474: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1475: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1476: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1477: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1478: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1479: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -9707,13 +41049,128 @@ entry: } define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1480: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1481: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1482: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1483: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1484: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1485: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1486: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1487: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: 
vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -9728,12 +41185,131 @@ entry: define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: 
test_vpcmpultw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1488: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1489: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1490: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1491: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1492: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1493: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1494: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1495: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, 
%xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -9744,12 +41320,131 @@ entry: } define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1496: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1497: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1498: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1499: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1500: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1501: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1502: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1503: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 
+; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -9761,13 +41456,133 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1504: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1505: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1506: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1507: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1508: +; NoVLX-NEXT: .cfi_offset %r12, 
-48 +; NoVLX-NEXT: .Lcfi1509: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1510: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1511: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), 
%rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %1 = bitcast <4 x i64> %__b to <16 x i16> @@ -9780,13 +41595,133 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1512: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1513: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1514: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1515: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1516: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1517: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1518: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1519: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <16 x i16> %load = load <4 x i64>, <4 x i64>* %__b @@ -9801,12 +41736,353 @@ entry: define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1520: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1521: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1522: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3 +; NoVLX-NEXT: vmovq %xmm3, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw 
$5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm2, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm5 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, 
%ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor %ymm2, %ymm4, %ymm4 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, 
%xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpmovsxbd %xmm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb 
$12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -9817,12 +42093,268 @@ entry: } define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuw (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuw (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1523: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1524: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1525: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm1 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm4 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq 
%xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2 +; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: 
kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -9834,13 +42366,363 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1526: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1527: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1528: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2 +; NoVLX-NEXT: vmovq %xmm2, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4 +; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8 +; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5 +; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm2, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; NoVLX-NEXT: vmovq %xmm3, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm9 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpextrq $1, %xmm3, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm6, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, 
%eax, %xmm3, %xmm4 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm6, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: vmovq %xmm7, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm7, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm7 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm8, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm8, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, 
%xmm3, %xmm3 +; NoVLX-NEXT: vmovq %xmm1, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm3 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm8 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm0 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; NoVLX-NEXT: vpmovdb %zmm1, %xmm7 +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm3 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm6 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm6, %ymm4, %ymm3 +; NoVLX-NEXT: vpxor %ymm6, %ymm2, %ymm2 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; 
NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm4 +; NoVLX-NEXT: vpxor %ymm6, %ymm8, %ymm2 +; NoVLX-NEXT: vpxor %ymm6, %ymm5, %ymm3 +; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm2 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vpand %xmm7, %xmm2, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm0, %xmm4, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: 
vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %1 = bitcast <8 x i64> %__b to <32 x i16> @@ -9853,13 +42735,278 @@ entry: } define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1529: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1530: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1531: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $96, %rsp +; NoVLX-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; NoVLX-NEXT: vmovq %xmm1, %rax +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: movq %rax, %rdx +; NoVLX-NEXT: vmovd %eax, %xmm2 +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4 +; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm5 +; NoVLX-NEXT: shrq $32, %rdx +; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2 +; NoVLX-NEXT: vpextrq $1, %xmm1, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm5, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm5, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm4, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm5 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm4, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: 
vpinsrw $3, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; NoVLX-NEXT: vmovq %xmm0, %rcx +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm6 +; NoVLX-NEXT: movl %ecx, %eax +; NoVLX-NEXT: shrl $16, %eax +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: movq %rcx, %rax +; NoVLX-NEXT: shrq $32, %rax +; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpextrq $1, %xmm0, %rax +; NoVLX-NEXT: shrq $48, %rcx +; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; NoVLX-NEXT: movl %eax, %ecx +; NoVLX-NEXT: shrl $16, %ecx +; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: movq %rax, %rcx +; NoVLX-NEXT: shrq $32, %rcx +; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm7 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 +; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; NoVLX-NEXT: vpmovdb %zmm0, %xmm2 +; NoVLX-NEXT: shrq $48, %rax +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm4 +; NoVLX-NEXT: vpinsrw $7, %eax, %xmm7, %xmm3 +; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3 +; NoVLX-NEXT: vpxor (%rsi), %ymm5, %ymm6 +; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm6, %ymm3 +; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3 +; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0 +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %eax, %xmm3 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, 
%eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4 +; NoVLX-NEXT: vpxor 32(%rsi), %ymm5, %ymm5 +; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4 +; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4 +; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4 +; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0 +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm4 +; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4 +; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2 +; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 +; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 +; 
NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %ecx +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: shlq $32, %rax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <32 x i16> %load = load <8 x i64>, <8 x i64>* %__b @@ -9874,11 +43021,54 @@ entry: define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -9889,11 +43079,54 @@ entry: } define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; 
VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -9905,12 +43138,73 @@ entry: } define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: 
vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -9924,12 +43218,73 @@ entry: } define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: 
kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -9945,11 +43300,55 @@ entry: define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; 
NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -9962,12 +43361,74 @@ entry: } define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -9984,11 
+43445,53 @@ entry: define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -9999,11 +43502,53 @@ entry: } define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -10015,12 +43560,72 @@ entry: } define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, 
%zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %1 = bitcast <2 x i64> %__b to <4 x i32> @@ -10034,12 +43639,72 @@ entry: } define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = 
[0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load <2 x i64>, <2 x i64>* %__b @@ -10055,11 +43720,54 @@ entry: define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -10072,12 +43780,73 @@ entry: } define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd 
%edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1 +; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x i32> %load = load i32, i32* %__b @@ -10094,11 +43863,42 @@ entry: define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask: +; 
NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1532:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1533:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1534:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10109,11 +43909,42 @@ entry:
}

define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1535:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1536:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1537:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10125,12 +43956,61 @@ entry:
}

define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1538:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1539:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1540:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10144,12 +44024,61 @@ entry:
}

define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1541:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1542:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1543:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10165,11 +44094,43 @@ entry:

define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1544:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1545:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1546:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10182,12 +44143,62 @@ entry:
}

define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1547:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1548:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1549:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10204,11 +44215,49 @@ entry:

define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1550:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1551:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1552:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10219,11 +44268,49 @@ entry:
}

define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1553:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1554:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1555:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10235,12 +44322,68 @@ entry:
}

define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1556:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1557:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1558:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -10254,12 +44397,68 @@ entry:
}

define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1559:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1560:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1561:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load <2 x i64>, <2 x i64>* %__b
@@ -10275,11 +44474,50 @@ entry:

define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1562:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1563:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1564:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10292,12 +44530,69 @@ entry:
}

define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1565:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1566:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1567:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $13, %k0, %k2
+; NoVLX-NEXT: kshiftrw $15, %k2, %k2
+; NoVLX-NEXT: kshiftlw $15, %k0, %k3
+; NoVLX-NEXT: kshiftrw $15, %k3, %k3
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k3, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k2, %eax
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
%load = load i32, i32* %__b
@@ -10314,21 +44609,23 @@ entry:

define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10340,21 +44637,23 @@ entry:
}

define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10367,23 +44666,25 @@ entry:
}

define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10397,23 +44698,25 @@ entry:
}

define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10429,21 +44732,23 @@ entry:

define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10457,23 +44762,25 @@ entry:
}

define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; NoVLX: ## BB#0: ## %entry
-; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -10490,12 +44797,72 @@ entry:

define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1568:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1569:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1570:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10506,12 +44873,72 @@ entry:
}

define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1571:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1572:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1573:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10523,13 +44950,75 @@ entry:
}

define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1574:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1575:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1576:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10542,13 +45031,75 @@ entry:
}

define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1577:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1578:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1579:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10563,12 +45114,72 @@ entry:

define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1580:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1581:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1582:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10581,13 +45192,75 @@ entry:
}

define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1583:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1584:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1585:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10603,12 +45276,77 @@ entry:

define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1586:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1587:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1588:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10619,12 +45357,77 @@ entry:
}

define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi), %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1589:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1590:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1591:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10636,13 +45439,80 @@ entry:
}

define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1592:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1593:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1594:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -10655,13 +45525,80 @@ entry:
}

define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1595:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1596:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1597:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k1, %k0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load <4 x i64>, <4 x i64>* %__b
@@ -10676,12 +45613,77 @@ entry:

define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1598:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1599:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1600:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
%load = load i32, i32* %__b
@@ -10694,13 +45696,80 @@ entry:
}

define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1601:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1602:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1603:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: kandw %k0, %k1, %k0
+; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftlw $14, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftlw $13, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kshiftlw $12, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftlw $11, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftlw $10, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftlw $9, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x i32> %load = load i32, i32* %__b @@ -10716,12 +45785,120 @@ entry: define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1604: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1605: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1606: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1607: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1608: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1609: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1610: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1611: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; 
NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -10732,12 +45909,120 @@ entry: } define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1612: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1613: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1614: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1615: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1616: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1617: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1618: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1619: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: 
kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -10749,13 +46034,122 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1620: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1621: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1622: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1623: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1624: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1625: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1626: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1627: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -10768,13 +46162,122 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: +; VLX: # 
BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1628: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1629: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1630: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1631: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1632: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1633: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1634: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1635: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; 
NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -10789,12 +46292,120 @@ entry: define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1636: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1637: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1638: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1639: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1640: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1641: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1642: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1643: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -10807,13 +46418,122 @@ entry: } define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1644: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1645: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1646: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1647: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1648: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1649: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1650: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1651: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -10829,12 +46549,125 @@ entry: define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1652: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1653: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1654: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; 
NoVLX-NEXT: .Lcfi1655: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1656: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1657: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1658: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1659: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 
x i32> @@ -10845,12 +46678,125 @@ entry: } define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1660: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1661: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1662: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1663: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1664: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1665: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1666: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1667: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -10862,13 +46808,127 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1668: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1669: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1670: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1671: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1672: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1673: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1674: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1675: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: 
kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %1 = bitcast <8 x i64> %__b to <16 x i32> @@ -10881,13 +46941,127 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1676: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1677: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1678: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1679: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; 
NoVLX-NEXT: .Lcfi1680: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1681: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1682: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1683: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load <8 x i64>, <8 x i64>* %__b @@ -10902,12 
+47076,125 @@ entry: define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1684: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1685: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1686: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1687: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1688: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1689: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1690: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1691: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, 
%xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -10920,13 +47207,127 @@ entry: } define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1692: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1693: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1694: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1695: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1696: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1697: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1698: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1699: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw 
$9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x i32> %load = load i32, i32* %__b @@ -10942,12 +47343,26 @@ entry: define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x i64> %1 = bitcast <2 x i64> %__b to <2 x i64> @@ 
 }

 define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -10975,13 +47404,37 @@ entry:
 }

 define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -10995,13 +47448,37 @@ entry:
 }

 define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -11017,12 +47494,27 @@ entry:
 }

 define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11035,13 +47527,38 @@ entry:
 }

 define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11058,11 +47575,38 @@ entry:
 }

 define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11073,11 +47617,38 @@ entry:
 }

 define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -11089,12 +47660,49 @@ entry:
 }

 define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11108,12 +47716,49 @@ entry:
 }

 define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -11129,11 +47774,39 @@ entry:
 }

 define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11146,12 +47819,50 @@ entry:
 }

 define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11168,11 +47879,37 @@ entry:
 }

 define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11183,11 +47920,37 @@ entry:
 }

 define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -11199,12 +47962,48 @@ entry:
 }

 define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11218,12 +48017,48 @@ entry:
 }

 define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -11239,11 +48074,38 @@ entry:
 }

 define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11256,12 +48118,49 @@ entry:
 }

 define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11278,11 +48177,42 @@ entry:
 }

 define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1700:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1701:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1702:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11293,11 +48223,42 @@ entry:
 }

 define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1703:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1704:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1705:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -11309,12 +48270,53 @@ entry:
 }

 define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1706:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1707:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1708:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11328,12 +48330,53 @@ entry:
 }

 define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1709:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1710:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1711:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -11349,11 +48392,43 @@ entry:
 }

 define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1712:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1713:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1714:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11366,12 +48441,54 @@ entry:
 }

 define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1715:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1716:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1717:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11388,11 +48505,49 @@ entry:
 }

 define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1718:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1719:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1720:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11403,11 +48558,49 @@ entry:
 }

 define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1721:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1722:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1723:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -11419,12 +48612,60 @@ entry:
 }

 define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1724:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1725:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1726:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -11438,12 +48679,60 @@ entry:
 }

 define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1727:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1728:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1729:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -11459,11 +48748,50 @@ entry:
 }

 define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1730:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1731:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1732:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11476,12 +48804,61 @@ entry:
 }

 define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovq %k0, %rax
-; CHECK-NEXT: retq
+; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .Lcfi1733:
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .Lcfi1734:
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .Lcfi1735:
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: kmovw %edi, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k1
+; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftlw $14, %k0, %k0
+; NoVLX-NEXT: kshiftrw $15, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: vmovd %ecx, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -11498,12 +48875,56 @@ entry:
 }

 define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
+; VLX: # BB#0: # %entry
+; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
+; NoVLX: # BB#0: # %entry
+; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: kshiftlw $7, %k0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k1
+; NoVLX-NEXT: kshiftlw $1, %k1, %k1
+; NoVLX-NEXT: korw %k0, %k1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -11514,12 +48935,56 @@ entry:
 }

 define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem:
-; CHECK: ## BB#0: ## %entry
CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11531,13 +48996,75 @@ entry: } define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw 
$13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11551,13 +49078,75 @@ entry: } define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, 
%k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11573,12 +49162,57 @@ entry: define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; 
NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11591,13 +49225,76 @@ entry: } define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; 
NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11614,12 +49311,55 @@ entry: define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11630,12 +49370,55 @@ entry: } define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem(<4 x i64> 
%__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11647,13 +49430,74 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; 
NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11667,13 +49511,74 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw 
$15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11689,12 +49594,56 @@ entry: define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd 
$255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11707,13 +49656,75 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; 
NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11730,12 +49741,44 @@ entry: define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1736: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1737: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1738: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11746,12 +49789,44 @@ entry: } define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; 
VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1739: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1740: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1741: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11763,13 +49838,63 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1742: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1743: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1744: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11783,13 +49908,63 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1745: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1746: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1747: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11805,12 +49980,45 @@ entry: define zeroext i32 
@test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1748: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1749: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1750: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11823,13 +50031,64 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1751: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1752: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1753: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: 
kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11846,12 +50105,51 @@ entry: define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1754: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1755: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1756: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11862,12 +50160,51 @@ entry: } define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* 
%__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1757: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1758: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1759: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11879,13 +50216,70 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1760: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1761: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1762: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %1 = bitcast <4 x i64> %__b to <4 x i64> @@ -11899,13 +50293,70 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1763: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1764: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1765: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load <4 x i64>, <4 x i64>* %__b @@ -11921,12 +50372,52 @@ entry: define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1766: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1767: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1768: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11939,13 +50430,71 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1769: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1770: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1771: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1 +; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: kmovw %edi, %k0 +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kshiftlw $13, %k0, %k2 +; NoVLX-NEXT: kshiftrw $15, %k2, %k2 +; NoVLX-NEXT: kshiftlw $15, %k0, %k3 +; NoVLX-NEXT: kshiftrw $15, %k3, %k3 +; NoVLX-NEXT: kshiftlw $14, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: kmovw %k3, %ecx +; NoVLX-NEXT: vmovd %ecx, %xmm1 +; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k2, %eax +; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x i64> %load = load i64, i64* %__b @@ -11962,12 +50511,20 @@ entry: define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; 
NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -11978,12 +50535,20 @@ entry: } define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -11995,13 +50560,22 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12014,13 +50588,22 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12035,12 +50618,20 @@ entry: define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: 
vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12053,13 +50644,22 @@ entry: } define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12075,12 +50675,70 @@ entry: define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1772: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1773: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1774: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, 
%r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12091,12 +50749,70 @@ entry: } define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1775: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1776: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1777: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12108,13 +50824,72 @@ entry: } define 
zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1778: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1779: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1780: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12127,13 +50902,72 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: 
pushq %rbp +; NoVLX-NEXT: .Lcfi1781: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1782: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1783: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12148,12 +50982,70 @@ entry: define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1784: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1785: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1786: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, 
%edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12166,13 +51058,72 @@ entry: } define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1787: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1788: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1789: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12188,12 +51139,75 @@ entry: define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1790: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1791: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1792: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12204,12 +51218,75 @@ entry: } define zeroext i64 
@test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1793: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1794: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1795: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12221,13 +51298,77 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask: +; NoVLX: # BB#0: # 
%entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1796: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1797: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1798: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %1 = bitcast <8 x i64> %__b to <8 x i64> @@ -12240,13 +51381,77 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1799: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1800: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1801: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; 
NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load <8 x i64>, <8 x i64>* %__b @@ -12261,12 +51466,75 @@ entry: define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1802: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1803: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1804: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; 
NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12279,13 +51547,77 @@ entry: } define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr { -; CHECK-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: kmovd %edi, %k1 +; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1805: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1806: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1807: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: kmovw %edi, %k1 +; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1} +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x i64> %load = load i64, i64* %__b @@ -12302,11 +51634,51 @@ entry: declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %1 = bitcast <2 x i64> %__b to <4 x float> @@ -12317,11 +51689,51 @@ entry: } define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> 
%EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load <2 x i64>, <2 x i64>* %__b @@ -12333,11 +51745,52 @@ entry: } define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: 
vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load float, float* %__b @@ -12351,11 +51804,50 @@ entry: define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %1 = bitcast <2 x i64> %__b to <4 x float> @@ -12366,11 +51858,50 @@ entry: } define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw 
%eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load <2 x i64>, <2 x i64>* %__b @@ -12382,11 +51913,51 @@ entry: } define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; 
NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load float, float* %__b @@ -12400,11 +51971,39 @@ entry: define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1808: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1809: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1810: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %1 = bitcast <2 x i64> %__b to <4 x float> @@ -12415,11 +52014,39 @@ entry: } define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1811: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1812: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1813: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), 
%eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load <2 x i64>, <2 x i64>* %__b @@ -12431,11 +52058,40 @@ entry: } define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1814: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1815: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1816: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load float, float* %__b @@ -12449,11 +52105,46 @@ entry: define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1817: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1818: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1819: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl 
{{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %1 = bitcast <2 x i64> %__b to <4 x float> @@ -12464,11 +52155,46 @@ entry: } define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1820: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1821: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1822: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load <2 x i64>, <2 x i64>* %__b @@ -12480,11 +52206,47 @@ entry: } define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1823: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1824: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1825: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1 +; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <4 x float> %load = load float, float* %__b @@ -12498,21 +52260,23 @@ entry: define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def> -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -12524,21 +52288,23 @@ entry: } define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vmovaps (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -12551,21 +52317,23 @@ entry: } define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, 
%k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq ; ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b: -; NoVLX: ## BB#0: ## %entry -; NoVLX-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> ; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 ; NoVLX-NEXT: kshiftlw $8, %k0, %k0 ; NoVLX-NEXT: kshiftrw $8, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> @@ -12580,12 +52348,72 @@ entry: define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1826: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1827: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1828: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %1 = bitcast <4 x i64> %__b to <8 x float> @@ -12596,12 +52424,72 @@ entry: } define zeroext i32 
@test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1829: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1830: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1831: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %load = load <4 x i64>, <4 x i64>* %__b @@ -12613,12 +52501,72 @@ entry: } define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1832: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1833: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq 
%rsp, %rbp +; NoVLX-NEXT: .Lcfi1834: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %load = load float, float* %__b @@ -12632,12 +52580,77 @@ entry: define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1835: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1836: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1837: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def> +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; 
NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %1 = bitcast <4 x i64> %__b to <8 x float> @@ -12648,12 +52661,77 @@ entry: } define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1838: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1839: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1840: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vmovaps (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb 
$0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %load = load <4 x i64>, <4 x i64>* %__b @@ -12665,12 +52743,77 @@ entry: } define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1841: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1842: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1843: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def> +; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; 
NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <8 x float> %load = load float, float* %__b @@ -12684,12 +52827,120 @@ entry: define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1844: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1845: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1846: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1847: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1848: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1849: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1850: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1851: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; 
NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> @@ -12700,12 +52951,120 @@ entry: } define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1852: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1853: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1854: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1855: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1856: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1857: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1858: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1859: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: 
kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %load = load <8 x i64>, <8 x i64>* %__b @@ -12717,12 +53076,120 @@ entry: } define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1860: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1861: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1862: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: .Lcfi1863: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1864: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1865: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1866: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1867: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps 
(%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %load = load float, float* %__b @@ -12736,12 +53203,18 @@ entry: define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovw %k0, %eax +; VLX-NEXT: 
vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> @@ -12752,12 +53225,125 @@ entry: define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1868: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1869: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1870: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1871: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1872: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1873: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1874: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1875: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, 
%r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> @@ -12768,12 +53354,125 @@ entry: } define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1876: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1877: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1878: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1879: +; NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1880: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1881: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1882: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1883: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: 
kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %load = load <8 x i64>, <8 x i64>* %__b @@ -12785,12 +53484,125 @@ entry: } define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1884: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1885: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1886: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: pushq %r15 +; NoVLX-NEXT: pushq %r14 +; NoVLX-NEXT: pushq %r13 +; NoVLX-NEXT: pushq %r12 +; NoVLX-NEXT: pushq %rbx +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: .Lcfi1887: +; 
NoVLX-NEXT: .cfi_offset %rbx, -56 +; NoVLX-NEXT: .Lcfi1888: +; NoVLX-NEXT: .cfi_offset %r12, -48 +; NoVLX-NEXT: .Lcfi1889: +; NoVLX-NEXT: .cfi_offset %r13, -40 +; NoVLX-NEXT: .Lcfi1890: +; NoVLX-NEXT: .cfi_offset %r14, -32 +; NoVLX-NEXT: .Lcfi1891: +; NoVLX-NEXT: .cfi_offset %r15, -24 +; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r11d +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r14d +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r15d +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r12d +; NoVLX-NEXT: kshiftlw $8, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r13d +; NoVLX-NEXT: kshiftlw $7, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $6, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ebx +; NoVLX-NEXT: kshiftlw $5, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $4, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $3, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $2, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vmovd %r10d, %xmm0 +; NoVLX-NEXT: kmovw %k1, %r10d +; NoVLX-NEXT: kshiftlw $1, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: leaq -40(%rbp), %rsp +; NoVLX-NEXT: popq %rbx +; NoVLX-NEXT: popq %r12 +; NoVLX-NEXT: popq %r13 +; NoVLX-NEXT: popq %r14 +; NoVLX-NEXT: popq %r15 +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %load = load float, float* %__b @@ -12804,13 
+53616,20 @@ entry: define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: movzwl %ax, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzwl %ax, %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <16 x float> %1 = bitcast <8 x i64> %__b to <16 x float> @@ -12822,12 +53641,23 @@ entry: declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -12838,12 +53668,23 @@ entry: } define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -12855,12 +53696,24 @@ entry: } define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: retq +; 
VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp) +; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -12874,11 +53727,35 @@ entry: define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -12889,11 +53766,35 @@ entry: } define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; 
NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -12905,11 +53806,36 @@ entry: } define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -12923,11 +53849,34 @@ entry: define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -12938,11 +53887,34 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, 
%k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -12954,11 +53926,35 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -12972,11 +53968,39 @@ entry: define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1892: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1893: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1894: +; 
NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -12987,11 +54011,39 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1895: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1896: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1897: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -13003,11 +54055,40 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1898: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1899: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1900: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: 
vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -13021,11 +54102,46 @@ entry: define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1901: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1902: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1903: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %1 = bitcast <2 x i64> %__b to <2 x double> @@ -13036,11 +54152,46 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1904: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1905: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1906: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: 
andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load <2 x i64>, <2 x i64>* %__b @@ -13052,11 +54203,47 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1907: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1908: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1909: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <2 x i64> %__a to <2 x double> %load = load double, double* %__b @@ -13070,12 +54257,53 @@ entry: define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: 
vcmpeqpd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %1 = bitcast <4 x i64> %__b to <4 x double> @@ -13086,12 +54314,53 @@ entry: } define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load <4 x i64>, <4 x i64>* %__b @@ -13103,12 +54372,54 @@ entry: } define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kshiftlw $7, %k0, %k0 +; NoVLX-NEXT: kshiftrw $7, %k0, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load double, double* %__b @@ -13122,12 +54433,52 @@ entry: define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: +; VLX: # BB#0: # 
%entry +; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %1 = bitcast <4 x i64> %__b to <4 x double> @@ -13138,12 +54489,52 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} 
{z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load <4 x i64>, <4 x i64>* %__b @@ -13155,12 +54546,53 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpextrb $4, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $0, %xmm0, %eax +; NoVLX-NEXT: andl $1, %eax +; NoVLX-NEXT: kmovw %eax, %k0 +; NoVLX-NEXT: kxorw %k0, %k0, %k1 +; NoVLX-NEXT: kshiftrw $1, %k1, %k1 +; NoVLX-NEXT: kshiftlw $1, %k1, %k1 +; NoVLX-NEXT: korw %k0, %k1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $8, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 +; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; NoVLX-NEXT: vpextrb $12, %xmm0, %eax +; NoVLX-NEXT: kmovw %eax, %k1 +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15] +; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load double, double* %__b @@ -13174,12 +54606,41 @@ entry: define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: 
vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1910: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1911: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1912: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %1 = bitcast <4 x i64> %__b to <4 x double> @@ -13190,12 +54651,41 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1913: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1914: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1915: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load <4 x i64>, <4 x i64>* %__b @@ -13207,12 +54697,42 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: 
retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1916: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1917: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1918: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load double, double* %__b @@ -13226,12 +54746,48 @@ entry: define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1919: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1920: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1921: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %1 = bitcast <4 x i64> %__b to <4 x double> @@ -13242,12 +54798,48 @@ entry: } define zeroext i64 
@test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1922: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1923: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1924: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load <4 x i64>, <4 x i64>* %__b @@ -13259,12 +54851,49 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1925: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1926: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1927: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1 +; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0 +; NoVLX-NEXT: vpmovqd %zmm0, %ymm0 +; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1 +; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 +; NoVLX-NEXT: 
vptestmd %zmm1, %zmm1, %k0 +; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <4 x i64> %__a to <4 x double> %load = load double, double* %__b @@ -13278,12 +54907,20 @@ entry: define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13294,12 +54931,20 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load <8 x i64>, <8 x i64>* %__b @@ -13311,12 +54956,20 @@ entry: } define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovw %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load double, double* %__b @@ -13330,12 +54983,22 @@ entry: define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: 
vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: movzbl %al, %eax +; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13346,12 +55009,70 @@ entry: define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1928: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1929: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1930: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13362,12 +55083,70 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: 
test_vcmpoeqpd_v8i1_v32i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1931: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1932: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1933: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load <8 x i64>, <8 x i64>* %__b @@ -13379,12 +55158,70 @@ entry: } define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1934: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1935: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1936: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $32, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; 
NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load double, double* %__b @@ -13398,12 +55235,19 @@ entry: define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovb %k0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovb %k0, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13414,12 +55258,75 @@ entry: define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1937: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1938: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1939: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 +; NoVLX-NEXT: vpxord 
%zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> @@ -13430,12 +55337,75 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1940: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1941: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1942: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, 
%k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, %k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load <8 x i64>, <8 x i64>* %__b @@ -13447,12 +55417,75 @@ entry: } define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 -; CHECK-NEXT: kmovq %k0, %rax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; VLX-NEXT: kmovq %k0, %rax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: pushq %rbp +; NoVLX-NEXT: .Lcfi1943: +; NoVLX-NEXT: .cfi_def_cfa_offset 16 +; NoVLX-NEXT: .Lcfi1944: +; NoVLX-NEXT: .cfi_offset %rbp, -16 +; NoVLX-NEXT: movq %rsp, %rbp +; NoVLX-NEXT: .Lcfi1945: +; NoVLX-NEXT: .cfi_def_cfa_register %rbp +; NoVLX-NEXT: andq $-32, %rsp +; NoVLX-NEXT: subq $64, %rsp +; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0 +; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) +; NoVLX-NEXT: kshiftlw $15, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r8d +; NoVLX-NEXT: kshiftlw $14, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %r9d +; NoVLX-NEXT: kshiftlw $13, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edx +; NoVLX-NEXT: kshiftlw $12, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %esi +; NoVLX-NEXT: kshiftlw $11, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %edi +; NoVLX-NEXT: kshiftlw $10, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %eax +; NoVLX-NEXT: kshiftlw $9, %k0, %k1 +; NoVLX-NEXT: kshiftrw $15, %k1, %k1 +; NoVLX-NEXT: kmovw %k1, %ecx +; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0 +; NoVLX-NEXT: kshiftlw $8, %k0, %k0 +; NoVLX-NEXT: kshiftrw $15, 
%k0, %k0 +; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 +; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 +; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, (%rsp) +; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; NoVLX-NEXT: shlq $32, %rcx +; NoVLX-NEXT: movl (%rsp), %eax +; NoVLX-NEXT: orq %rcx, %rax +; NoVLX-NEXT: movq %rbp, %rsp +; NoVLX-NEXT: popq %rbp +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %load = load double, double* %__b @@ -13466,13 +55499,20 @@ entry: define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr { -; CHECK-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask: -; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask: +; VLX: # BB#0: # %entry +; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; VLX-NEXT: kmovd %k0, %eax +; VLX-NEXT: movzbl %al, %eax +; VLX-NEXT: vzeroupper +; VLX-NEXT: retq +; +; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask: +; NoVLX: # BB#0: # %entry +; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 +; NoVLX-NEXT: kmovw %k0, %eax +; NoVLX-NEXT: movzbl %al, %eax +; NoVLX-NEXT: retq entry: %0 = bitcast <8 x i64> %__a to <8 x double> %1 = bitcast <8 x i64> %__b to <8 x double> diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll index 092b139fca2f9..1d78ee26a0b9b 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -1,48 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) { ; SSE2-LABEL: v8i16: -; SSE2: ## BB#0: +; SSE2: # BB#0: ; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i16: -; SSSE3: ## BB#0: +; SSSE3: # BB#0: 
; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 ; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: pmovmskb %xmm2, %eax -; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSSE3-NEXT: retq ; ; AVX12-LABEL: v8i16: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k1 ; AVX512-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX512-NEXT: retq %x0 = icmp sgt <8 x i16> %a, %b %x1 = icmp sgt <8 x i16> %c, %d @@ -53,25 +53,25 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) { define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; SSE2-SSSE3-LABEL: v4i32: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i32: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 ; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax @@ -87,25 +87,25 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) { ; SSE2-SSSE3-LABEL: v4f32: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: cmpltps %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: andps %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: movmskps %xmm3, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4f32: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; AVX12-NEXT: vcmpltps %xmm2, %xmm3, %xmm1 ; AVX12-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4f32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k1 ; AVX512-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax @@ -121,29 +121,29 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { ; SSE2-SSSE3-LABEL: v16i8: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: 
# BB#0: ; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax -; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v16i8: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 ; AVX512-NEXT: vpcmpgtb %xmm3, %xmm2, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; AVX512-NEXT: retq %x0 = icmp sgt <16 x i8> %a, %b %x1 = icmp sgt <16 x i8> %c, %d @@ -154,7 +154,7 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; SSE2-SSSE3-LABEL: v2i8: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: psllq $56, %xmm2 ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: psrad $31, %xmm4 @@ -206,11 +206,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i8: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: vpsllq $56, %xmm3, %xmm3 ; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 ; AVX1-NEXT: vpsrad $24, %xmm3, %xmm3 @@ -235,11 +235,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i8: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpsllq $56, %xmm3, %xmm3 ; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4 ; AVX2-NEXT: vpsrad $24, %xmm3, %xmm3 @@ -264,11 +264,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $56, %xmm3, %xmm3 ; AVX512-NEXT: vpsraq $56, %xmm3, %xmm3 ; AVX512-NEXT: vpsllq $56, %xmm2, %xmm2 @@ -292,7 +292,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) { define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; SSE2-SSSE3-LABEL: v2i16: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: psllq $48, %xmm2 ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: psrad $31, %xmm4 @@ -344,11 +344,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: 
# kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i16: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: vpsllq $48, %xmm3, %xmm3 ; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 ; AVX1-NEXT: vpsrad $16, %xmm3, %xmm3 @@ -373,11 +373,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i16: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpsllq $48, %xmm3, %xmm3 ; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4 ; AVX2-NEXT: vpsrad $16, %xmm3, %xmm3 @@ -402,11 +402,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $48, %xmm3, %xmm3 ; AVX512-NEXT: vpsraq $48, %xmm3, %xmm3 ; AVX512-NEXT: vpsllq $48, %xmm2, %xmm2 @@ -430,7 +430,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) { define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; SSE2-SSSE3-LABEL: v2i32: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: psllq $32, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] ; SSE2-SSSE3-NEXT: psrad $31, %xmm2 @@ -474,11 +474,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i32: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] @@ -499,11 +499,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] @@ -524,11 +524,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX512-NEXT: vpsraq $32, %xmm3, %xmm3 ; AVX512-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -552,7 +552,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; SSE2-SSSE3-LABEL: v2i64: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] ; SSE2-SSSE3-NEXT: pxor 
%xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0 @@ -576,20 +576,20 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v2i64: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskpd %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 ; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax @@ -605,25 +605,25 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) { define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) { ; SSE2-SSSE3-LABEL: v2f64: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: cmpltpd %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: andpd %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v2f64: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 ; AVX12-NEXT: vcmpltpd %xmm2, %xmm3, %xmm1 ; AVX12-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskpd %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v2f64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1 ; AVX512-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax @@ -639,7 +639,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; SSE2-SSSE3-LABEL: v4i8: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: pslld $24, %xmm3 ; SSE2-SSSE3-NEXT: psrad $24, %xmm3 ; SSE2-SSSE3-NEXT: pslld $24, %xmm2 @@ -652,11 +652,11 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i8: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpslld $24, %xmm3, %xmm3 ; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3 ; AVX12-NEXT: vpslld $24, %xmm2, %xmm2 @@ -669,11 +669,11 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $24, %xmm3, %xmm3 ; AVX512-NEXT: vpsrad $24, %xmm3, %xmm3 ; AVX512-NEXT: vpslld $24, %xmm2, %xmm2 @@ -697,7 +697,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { define i4 @v4i16(<4 x i16> %a, <4 x 
i16> %b, <4 x i16> %c, <4 x i16> %d) { ; SSE2-SSSE3-LABEL: v4i16: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: pslld $16, %xmm3 ; SSE2-SSSE3-NEXT: psrad $16, %xmm3 ; SSE2-SSSE3-NEXT: pslld $16, %xmm2 @@ -710,11 +710,11 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i16: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpslld $16, %xmm3, %xmm3 ; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3 ; AVX12-NEXT: vpslld $16, %xmm2, %xmm2 @@ -727,11 +727,11 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $16, %xmm3, %xmm3 ; AVX512-NEXT: vpsrad $16, %xmm3, %xmm3 ; AVX512-NEXT: vpslld $16, %xmm2, %xmm2 @@ -755,7 +755,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; SSE2-LABEL: v8i8: -; SSE2: ## BB#0: +; SSE2: # BB#0: ; SSE2-NEXT: psllw $8, %xmm3 ; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: psllw $8, %xmm2 @@ -770,11 +770,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i8: -; SSSE3: ## BB#0: +; SSSE3: # BB#0: ; SSSE3-NEXT: psllw $8, %xmm3 ; SSSE3-NEXT: psraw $8, %xmm3 ; SSSE3-NEXT: psllw $8, %xmm2 @@ -788,11 +788,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; SSSE3-NEXT: pand %xmm2, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSSE3-NEXT: retq ; ; AVX12-LABEL: v8i8: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpsllw $8, %xmm3, %xmm3 ; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3 ; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2 @@ -806,11 +806,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $8, %xmm3, %xmm3 ; AVX512-NEXT: vpsraw $8, %xmm3, %xmm3 ; AVX512-NEXT: vpsllw $8, %xmm2, %xmm2 @@ -822,7 +822,7 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k1 ; AVX512-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX512-NEXT: retq %x0 = icmp sgt <8 x i8> %a, %b %x1 = icmp sgt <8 x i8> %c, %d 
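[Illustrative sketch, not part of the commit: every function in these bitcast-and-setcc tests exercises the same IR pattern — two vector compares produce <N x i1> predicate vectors, the predicates are and'ed together, and the result is bitcast to an iN scalar mask; the autogenerated CHECK lines above pin down the movmsk/pack/kshift/kmov sequences each subtarget is expected to emit for that pattern. The function name, RUN line, and CHECK line below are hypothetical and shown only to make the tested pattern explicit.]

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
; CHECK-LABEL: v4i32_sketch:
define i4 @v4i32_sketch(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
  %x0 = icmp sgt <4 x i32> %a, %b    ; first compare  -> <4 x i1>
  %x1 = icmp sgt <4 x i32> %c, %d    ; second compare -> <4 x i1>
  %y = and <4 x i1> %x0, %x1         ; combine the two predicate vectors
  %res = bitcast <4 x i1> %y to i4   ; pack the mask bits into an i4 scalar
  ret i4 %res
}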
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll index a6d6ca155302e..95529686a58af 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSE2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSSE3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSE2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSSE3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512 define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; SSE2-SSSE3-LABEL: v4i64: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0] ; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm1 @@ -57,11 +57,11 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; SSE2-SSSE3-NEXT: psrad $31, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v4i64: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 @@ -74,12 +74,12 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovmskps %xmm0, %eax -; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: v4i64: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 @@ -88,12 +88,12 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovmskps %xmm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 ; AVX512-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax @@ -110,7 +110,7 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 
x double> %c, <4 x double> %d) { ; SSE2-SSSE3-LABEL: v4f64: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] @@ -123,11 +123,11 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> ; SSE2-SSSE3-NEXT: psrad $31, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6 ; SSE2-SSSE3-NEXT: movmskps %xmm6, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4f64: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 @@ -136,12 +136,12 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> ; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: vzeroupper ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4f64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1 ; AVX512-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax @@ -158,7 +158,7 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { ; SSE2-LABEL: v16i16: -; SSE2: ## BB#0: +; SSE2: # BB#0: ; SSE2-NEXT: pcmpgtw %xmm3, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm3, %xmm1 @@ -181,11 +181,11 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { ; SSE2-NEXT: pcmpgtb %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; SSE2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v16i16: -; SSSE3: ## BB#0: +; SSSE3: # BB#0: ; SSSE3-NEXT: pcmpgtw %xmm3, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; SSSE3-NEXT: pshufb %xmm3, %xmm1 @@ -208,11 +208,11 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { ; SSSE3-NEXT: pcmpgtb %xmm4, %xmm2 ; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: pmovmskb %xmm2, %eax -; SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; SSSE3-NEXT: retq ; ; AVX1-LABEL: v16i16: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4 @@ -225,12 +225,12 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { ; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i16: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 @@ -239,16 +239,16 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax 
-; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 ; AVX512-NEXT: vpcmpgtw %ymm3, %ymm2, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x0 = icmp sgt <16 x i16> %a, %b @@ -260,7 +260,7 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; SSE2-LABEL: v8i32: -; SSE2: ## BB#0: +; SSE2: # BB#0: ; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] @@ -287,11 +287,11 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i32: -; SSSE3: ## BB#0: +; SSSE3: # BB#0: ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSSE3-NEXT: pshufb %xmm3, %xmm1 @@ -310,11 +310,11 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; SSSE3-NEXT: pand %xmm0, %xmm4 ; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: pmovmskb %xmm4, %eax -; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSSE3-NEXT: retq ; ; AVX1-LABEL: v8i32: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 @@ -328,12 +328,12 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 @@ -343,16 +343,16 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vpmovmskb %xmm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 ; AVX512-NEXT: vpcmpgtd %ymm3, %ymm2, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x0 = icmp sgt <8 x i32> %a, %b @@ -364,7 +364,7 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { ; SSE2-LABEL: v8f32: -; SSE2: 
## BB#0: +; SSE2: # BB#0: ; SSE2-NEXT: cmpltps %xmm1, %xmm3 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] @@ -391,11 +391,11 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8f32: -; SSSE3: ## BB#0: +; SSSE3: # BB#0: ; SSSE3-NEXT: cmpltps %xmm1, %xmm3 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSSE3-NEXT: pshufb %xmm1, %xmm3 @@ -414,11 +414,11 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) ; SSSE3-NEXT: pand %xmm2, %xmm6 ; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: pmovmskb %xmm6, %eax -; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSSE3-NEXT: retq ; ; AVX12-LABEL: v8f32: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 @@ -428,16 +428,16 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: vzeroupper ; AVX12-NEXT: retq ; ; AVX512-LABEL: v8f32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k1 ; AVX512-NEXT: vcmpltps %ymm2, %ymm3, %k0 {%k1} ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x0 = fcmp ogt <8 x float> %a, %b @@ -449,7 +449,7 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { ; SSE2-SSSE3-LABEL: v32i8: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pcmpgtb %xmm6, %xmm4 @@ -561,14 +561,14 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v32i8: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: Lcfi0: +; AVX1-NEXT: .Lcfi0: ; AVX1-NEXT: .cfi_def_cfa_offset 16 -; AVX1-NEXT: Lcfi1: +; AVX1-NEXT: .Lcfi1: ; AVX1-NEXT: .cfi_offset %rbp, -16 ; AVX1-NEXT: movq %rsp, %rbp -; AVX1-NEXT: Lcfi2: +; AVX1-NEXT: .Lcfi2: ; AVX1-NEXT: .cfi_def_cfa_register %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $32, %rsp @@ -687,7 +687,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { ; AVX1-NEXT: retq ; ; AVX2-LABEL: v32i8: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -696,7 +696,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { ; AVX2-NEXT: retq ; ; AVX512-LABEL: v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k1 ; AVX512-NEXT: vpcmpgtb %ymm3, %ymm2, %k0 {%k1} ; AVX512-NEXT: kmovd 
%k0, %eax diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll new file mode 100644 index 0000000000000..2eba79b0297f9 --- /dev/null +++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -0,0 +1,1868 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW + +define i8 @v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) { +; SSE-LABEL: v8i64: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pcmpgtq %xmm7, %xmm3 +; SSE-NEXT: pcmpgtq %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: pslld $31, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: pshufb %xmm3, %xmm2 +; SSE-NEXT: pcmpgtq %xmm5, %xmm1 +; SSE-NEXT: pcmpgtq %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: psrad $31, %xmm0 +; SSE-NEXT: pshufb %xmm3, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2] +; SSE-NEXT: pslld $31, %xmm9 +; SSE-NEXT: psrad $31, %xmm9 +; SSE-NEXT: pshufb %xmm3, %xmm9 +; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2] +; SSE-NEXT: pslld $31, %xmm8 +; SSE-NEXT: psrad $31, %xmm8 +; SSE-NEXT: pshufb %xmm3, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: psllw $15, %xmm8 +; SSE-NEXT: psraw $15, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE-NEXT: pmovmskb %xmm8, %eax +; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSE-NEXT: retq +; +; AVX1-LABEL: v8i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: 
vpcmpgtq %xmm7, %xmm5, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm3 +; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v8i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm7, %ymm5, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm6, %ymm4, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpacksswb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v8i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1} +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v8i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x0 = icmp sgt <8 x i64> %a, %b + %x1 = icmp sgt <8 x i64> %c, %d + %y = and <8 x i1> %x0, %x1 + %res = bitcast <8 x i1> %y to i8 + ret i8 %res +} + +define i8 @v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d) { +; SSE-LABEL: v8f64: +; SSE: # BB#0: +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: cmpltpd %xmm3, %xmm7 +; SSE-NEXT: cmpltpd %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] +; SSE-NEXT: pslld $31, %xmm6 +; SSE-NEXT: psrad $31, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: pshufb %xmm2, %xmm6 +; SSE-NEXT: cmpltpd %xmm1, %xmm5 +; SSE-NEXT: cmpltpd %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] +; SSE-NEXT: 
pslld $31, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: pshufb %xmm2, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: psllw $15, %xmm4 +; SSE-NEXT: psraw $15, %xmm4 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2] +; SSE-NEXT: pslld $31, %xmm9 +; SSE-NEXT: psrad $31, %xmm9 +; SSE-NEXT: pshufb %xmm2, %xmm9 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2] +; SSE-NEXT: pslld $31, %xmm8 +; SSE-NEXT: psrad $31, %xmm8 +; SSE-NEXT: pshufb %xmm2, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: psllw $15, %xmm8 +; SSE-NEXT: psraw $15, %xmm8 +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: pshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE-NEXT: pmovmskb %xmm8, %eax +; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSE-NEXT: retq +; +; AVX12-LABEL: v8f64: +; AVX12: # BB#0: +; AVX12-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1 +; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX12-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 +; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX12-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0 +; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX12-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX12-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX12-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX12-NEXT: vcmpltpd %ymm5, %ymm7, %ymm1 +; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX12-NEXT: vcmpltpd %ymm4, %ymm6, %ymm2 +; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX12-NEXT: vpacksswb %xmm4, %xmm2, %xmm2 +; AVX12-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX12-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX12-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: vzeroupper +; AVX12-NEXT: retq +; +; AVX512F-LABEL: v8f64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; AVX512F-NEXT: vcmpltpd %zmm2, %zmm3, %k0 {%k1} +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v8f64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; AVX512BW-NEXT: vcmpltpd %zmm2, %zmm3, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x0 = fcmp ogt <8 x double> %a, %b + %x1 = fcmp ogt <8 x double> %c, %d + %y = and <8 x i1> %x0, %x1 + %res = bitcast <8 x i1> %y to i8 + ret i8 %res +} + +define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { +; SSE-LABEL: v32i16: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pcmpgtw %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE-NEXT: pshufb %xmm5, %xmm1 +; SSE-NEXT: 
pcmpgtw %xmm4, %xmm0 +; SSE-NEXT: pshufb %xmm5, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: pcmpgtw %xmm7, %xmm3 +; SSE-NEXT: pshufb %xmm5, %xmm3 +; SSE-NEXT: pcmpgtw %xmm6, %xmm2 +; SSE-NEXT: pshufb %xmm5, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pshufb %xmm5, %xmm11 +; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufb %xmm5, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: pshufb %xmm5, %xmm10 +; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufb %xmm5, %xmm9 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: pextrb $15, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $15, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; 
SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; SSE-NEXT: shll $16, %ecx +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: orl %ecx, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: v32i16: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi0: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi1: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi2: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vpcmpgtw %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtw %xmm7, %xmm5, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtw %xmm6, %xmm4, %xmm3 +; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) 
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: movl (%rsp), %eax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v32i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: .Lcfi0: +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: .Lcfi1: +; AVX512F-NEXT: .cfi_offset %rbp, -16 +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: .Lcfi2: +; AVX512F-NEXT: .cfi_def_cfa_register %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $32, %rsp +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb 
$1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $8, %k0, 
%k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; 
AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2 +; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm2 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, 
{{[0-9]+}}(%rsp) +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, (%rsp) +; AVX512F-NEXT: movl (%rsp), %eax +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x0 = icmp sgt <32 x i16> %a, %b + %x1 = icmp sgt <32 x i16> %c, %d + %y = and <32 x i1> %x0, %x1 + %res = bitcast <32 x i1> %y to i32 + ret i32 %res +} + +define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) { +; SSE-LABEL: v16i32: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: pshufb %xmm7, %xmm3 +; SSE-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE-NEXT: pshufb %xmm7, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: psllw $15, %xmm2 +; SSE-NEXT: psraw $15, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE-NEXT: pshufb %xmm3, %xmm2 +; SSE-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE-NEXT: pshufb %xmm7, %xmm1 +; SSE-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE-NEXT: pshufb %xmm7, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: pshufb %xmm3, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpgtb %xmm0, %xmm4 +; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pshufb %xmm7, %xmm11 +; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufb %xmm7, %xmm9 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; SSE-NEXT: psllw $15, %xmm9 +; SSE-NEXT: psraw $15, %xmm9 +; SSE-NEXT: pshufb %xmm3, %xmm9 +; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: pshufb %xmm7, %xmm10 +; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufb %xmm7, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: psllw $15, %xmm8 +; SSE-NEXT: psraw $15, %xmm8 +; SSE-NEXT: pshufb %xmm3, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: psllw $7, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pcmpgtb %xmm8, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SSE-NEXT: retq +; +; AVX1-LABEL: v16i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: 
vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpand %xmm9, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm7, %xmm5, %xmm3 +; AVX1-NEXT: vpacksswb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm9, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-NEXT: vpacksswb %xmm7, %xmm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4 +; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; AVX2-NEXT: vpsllw $7, %xmm3, %xmm3 +; AVX2-NEXT: vpand %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v16i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm2, %k0 {%k1} +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpcmpgtd %zmm3, %zmm2, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x0 = icmp sgt <16 x i32> %a, %b + %x1 = icmp sgt <16 x i32> %c, %d + %y = and <16 x i1> %x0, %x1 + %res = bitcast <16 x i1> %y to i16 + ret i16 %res +} + +define i16 @v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d) { +; SSE-LABEL: v16f32: +; SSE: # 
BB#0: +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: cmpltps %xmm3, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE-NEXT: pshufb %xmm3, %xmm7 +; SSE-NEXT: cmpltps %xmm2, %xmm6 +; SSE-NEXT: pshufb %xmm3, %xmm6 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: psllw $15, %xmm6 +; SSE-NEXT: psraw $15, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; SSE-NEXT: pshufb %xmm2, %xmm6 +; SSE-NEXT: cmpltps %xmm1, %xmm5 +; SSE-NEXT: pshufb %xmm3, %xmm5 +; SSE-NEXT: cmpltps %xmm0, %xmm4 +; SSE-NEXT: pshufb %xmm3, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: psllw $15, %xmm4 +; SSE-NEXT: psraw $15, %xmm4 +; SSE-NEXT: pshufb %xmm2, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: psllw $7, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pcmpgtb %xmm4, %xmm5 +; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pshufb %xmm3, %xmm11 +; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pshufb %xmm3, %xmm9 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; SSE-NEXT: psllw $15, %xmm9 +; SSE-NEXT: psraw $15, %xmm9 +; SSE-NEXT: pshufb %xmm2, %xmm9 +; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: pshufb %xmm3, %xmm10 +; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pshufb %xmm3, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: psllw $15, %xmm8 +; SSE-NEXT: psraw $15, %xmm8 +; SSE-NEXT: pshufb %xmm2, %xmm8 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: psllw $7, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pcmpgtb %xmm8, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SSE-NEXT: retq +; +; AVX12-LABEL: v16f32: +; AVX12: # BB#0: +; AVX12-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX12-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 +; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX12-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX12-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX12-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX12-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX12-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX12-NEXT: vcmpltps %ymm5, %ymm7, %ymm5 +; AVX12-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX12-NEXT: vpacksswb %xmm7, %xmm5, %xmm5 +; AVX12-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX12-NEXT: vcmpltps %ymm4, %ymm6, %ymm4 +; AVX12-NEXT: vextractf128 $1, %ymm4, %xmm6 +; AVX12-NEXT: vpacksswb %xmm6, %xmm4, %xmm4 +; AVX12-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; AVX12-NEXT: vpsllw $7, %xmm3, %xmm3 +; AVX12-NEXT: vpand %xmm1, %xmm3, %xmm1 +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: # 
kill: %AX<def> %AX<kill> %EAX<kill> +; AVX12-NEXT: vzeroupper +; AVX12-NEXT: retq +; +; AVX512F-LABEL: v16f32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 +; AVX512F-NEXT: vcmpltps %zmm2, %zmm3, %k0 {%k1} +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16f32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1 +; AVX512BW-NEXT: vcmpltps %zmm2, %zmm3, %k0 {%k1} +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x0 = fcmp ogt <16 x float> %a, %b + %x1 = fcmp ogt <16 x float> %c, %d + %y = and <16 x i1> %x0, %x1 + %res = bitcast <16 x i1> %y to i16 + ret i16 %res +} + +define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { +; SSE-LABEL: v64i8: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pcmpgtb %xmm6, %xmm2 +; SSE-NEXT: pcmpgtb %xmm7, %xmm3 +; SSE-NEXT: pcmpgtb %xmm4, %xmm0 +; SSE-NEXT: pcmpgtb %xmm5, %xmm1 +; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pextrb $15, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm11, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $15, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb 
$12, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm10, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $15, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm9, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $15, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm8, %eax +; 
SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm8, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx +; SSE-NEXT: orl %eax, %ecx +; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; SSE-NEXT: shll $16, %edx +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: orl %edx, %eax +; SSE-NEXT: shlq $32, %rax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: v64i8: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi3: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi4: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi5: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vpcmpgtb %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm8 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpcmpgtb %xmm7, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm6, %xmm4, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm2, %eax +; AVX1-NEXT: andb $1, %al 
+; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $13, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $11, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $9, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $7, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $5, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, 
{{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $3, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: movl (%rsp), %ecx +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: shlq $32, %rax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v64i8: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .Lcfi0: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .Lcfi1: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: .Lcfi2: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpcmpgtb %ymm6, %ymm4, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $14, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $13, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $12, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $11, %xmm2, %eax +; AVX2-NEXT: 
andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $10, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $9, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $8, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $7, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $6, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $5, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $3, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $2, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $15, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $14, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $13, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $11, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $10, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $9, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $7, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $6, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $5, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $3, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $2, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $1, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $0, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $14, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $13, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $11, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $10, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $9, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: 
vpextrb $7, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $6, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $5, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $3, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $2, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $1, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $0, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movl (%rsp), %ecx +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: shlq $32, %rax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: .Lcfi3: +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: .Lcfi4: +; AVX512F-NEXT: .cfi_offset %rbp, -16 +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: .Lcfi5: +; AVX512F-NEXT: .cfi_def_cfa_register %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $64, %rsp +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm4, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: 
vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, (%rsp) +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl (%rsp), %ecx +; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX512F-NEXT: shlq $32, %rax +; AVX512F-NEXT: orq %rcx, %rax +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 +; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k0 {%k1} +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x0 = icmp sgt <64 x i8> %a, %b + %x1 = icmp sgt <64 x i8> %c, %d + %y = and <64 x i1> %x0, %x1 + %res = bitcast <64 x i1> %y to i64 + ret i64 %res +} diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll new file mode 100644 index 0000000000000..9b6401d1a76c9 --- /dev/null +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -0,0 +1,3483 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 + +; +; 128-bit vectors +; + +define <2 x i64> @ext_i2_2i64(i2 %a0) { +; SSE2-SSSE3-LABEL: ext_i2_2i64: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: andb $3, %dil +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shlq $62, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movq %rcx, %xmm1 +; SSE2-SSSE3-NEXT: shlq $63, %rax +; SSE2-SSSE3-NEXT: sarq $63, %rax +; SSE2-SSSE3-NEXT: movq %rax, %xmm0 +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: ext_i2_2i64: +; AVX12: # BB#0: +; AVX12-NEXT: andb $3, %dil +; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $62, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vmovq %rcx, %xmm0 +; AVX12-NEXT: shlq $63, %rax +; AVX12-NEXT: sarq $63, %rax +; AVX12-NEXT: vmovq %rax, %xmm1 +; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX12-NEXT: retq +; +; AVX512-LABEL: ext_i2_2i64: +; AVX512: # BB#0: +; AVX512-NEXT: andb $3, %dil +; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> 
+; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = bitcast i2 %a0 to <2 x i1> + %2 = sext <2 x i1> %1 to <2 x i64> + ret <2 x i64> %2 +} + +define <4 x i32> @ext_i4_4i32(i4 %a0) { +; SSE2-SSSE3-LABEL: ext_i4_4i32: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: andb $15, %dil +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shlq $60, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shlq $61, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shlq $62, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shlq $63, %rax +; SSE2-SSSE3-NEXT: sarq $63, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: ext_i4_4i32: +; AVX12: # BB#0: +; AVX12-NEXT: andb $15, %dil +; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $62, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: movq %rax, %rdx +; AVX12-NEXT: shlq $63, %rdx +; AVX12-NEXT: sarq $63, %rdx +; AVX12-NEXT: vmovd %edx, %xmm0 +; AVX12-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $61, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: shlq $60, %rax +; AVX12-NEXT: sarq $63, %rax +; AVX12-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX12-NEXT: retq +; +; AVX512-LABEL: ext_i4_4i32: +; AVX512: # BB#0: +; AVX512-NEXT: andb $15, %dil +; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = bitcast i4 %a0 to <4 x i1> + %2 = sext <4 x i1> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @ext_i8_8i16(i8 %a0) { +; SSE2-SSSE3-LABEL: ext_i8_8i16: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shrq $7, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shlq $57, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shlq $58, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shlq $59, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shlq $60, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movq 
%rax, %rcx +; SSE2-SSSE3-NEXT: shlq $61, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: shlq $62, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: shlq $63, %rax +; SSE2-SSSE3-NEXT: sarq $63, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: ext_i8_8i16: +; AVX12: # BB#0: +; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $62, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: movq %rax, %rdx +; AVX12-NEXT: shlq $63, %rdx +; AVX12-NEXT: sarq $63, %rdx +; AVX12-NEXT: vmovd %edx, %xmm0 +; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $61, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $60, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $59, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $58, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $57, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: shrq $7, %rax +; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX12-NEXT: retq +; +; AVX512-LABEL: ext_i8_8i16: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: vpmovm2w %k0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast i8 %a0 to <8 x i1> + %2 = sext <8 x i1> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @ext_i16_16i8(i16 %a0) { +; SSE2-SSSE3-LABEL: ext_i16_16i8: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: pushq %rbp +; SSE2-SSSE3-NEXT: .Lcfi0: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16 +; SSE2-SSSE3-NEXT: pushq %r15 +; SSE2-SSSE3-NEXT: .Lcfi1: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24 +; SSE2-SSSE3-NEXT: pushq %r14 +; SSE2-SSSE3-NEXT: .Lcfi2: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32 +; SSE2-SSSE3-NEXT: pushq %r13 +; SSE2-SSSE3-NEXT: .Lcfi3: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40 +; SSE2-SSSE3-NEXT: pushq %r12 +; SSE2-SSSE3-NEXT: .Lcfi4: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48 +; SSE2-SSSE3-NEXT: pushq %rbx +; SSE2-SSSE3-NEXT: .Lcfi5: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56 +; SSE2-SSSE3-NEXT: .Lcfi6: +; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56 +; SSE2-SSSE3-NEXT: .Lcfi7: +; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48 +; SSE2-SSSE3-NEXT: .Lcfi8: +; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40 +; SSE2-SSSE3-NEXT: .Lcfi9: +; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32 +; SSE2-SSSE3-NEXT: .Lcfi10: +; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24 +; SSE2-SSSE3-NEXT: .Lcfi11: +; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16 +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rax +; SSE2-SSSE3-NEXT: movq %rax, %r8 +; SSE2-SSSE3-NEXT: movq %rax, %r9 +; SSE2-SSSE3-NEXT: movq %rax, %r10 +; SSE2-SSSE3-NEXT: movq %rax, %r11 +; 
SSE2-SSSE3-NEXT: movq %rax, %r14 +; SSE2-SSSE3-NEXT: movq %rax, %r15 +; SSE2-SSSE3-NEXT: movq %rax, %r12 +; SSE2-SSSE3-NEXT: movq %rax, %r13 +; SSE2-SSSE3-NEXT: movq %rax, %rbx +; SSE2-SSSE3-NEXT: movq %rax, %rcx +; SSE2-SSSE3-NEXT: movq %rax, %rdx +; SSE2-SSSE3-NEXT: movq %rax, %rsi +; SSE2-SSSE3-NEXT: movq %rax, %rdi +; SSE2-SSSE3-NEXT: movq %rax, %rbp +; SSE2-SSSE3-NEXT: shrq $15, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm0 +; SSE2-SSSE3-NEXT: movq %rax, %rbp +; SSE2-SSSE3-NEXT: movsbq %al, %rax +; SSE2-SSSE3-NEXT: shlq $49, %r8 +; SSE2-SSSE3-NEXT: sarq $63, %r8 +; SSE2-SSSE3-NEXT: movd %r8d, %xmm1 +; SSE2-SSSE3-NEXT: shlq $50, %r9 +; SSE2-SSSE3-NEXT: sarq $63, %r9 +; SSE2-SSSE3-NEXT: movd %r9d, %xmm2 +; SSE2-SSSE3-NEXT: shlq $51, %r10 +; SSE2-SSSE3-NEXT: sarq $63, %r10 +; SSE2-SSSE3-NEXT: movd %r10d, %xmm3 +; SSE2-SSSE3-NEXT: shlq $52, %r11 +; SSE2-SSSE3-NEXT: sarq $63, %r11 +; SSE2-SSSE3-NEXT: movd %r11d, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: shlq $53, %r14 +; SSE2-SSSE3-NEXT: sarq $63, %r14 +; SSE2-SSSE3-NEXT: movd %r14d, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-SSSE3-NEXT: shlq $54, %r15 +; SSE2-SSSE3-NEXT: sarq $63, %r15 +; SSE2-SSSE3-NEXT: movd %r15d, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-SSSE3-NEXT: shlq $55, %r12 +; SSE2-SSSE3-NEXT: sarq $63, %r12 +; SSE2-SSSE3-NEXT: movd %r12d, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-SSSE3-NEXT: shlq $60, %r13 +; SSE2-SSSE3-NEXT: sarq $63, %r13 +; SSE2-SSSE3-NEXT: movd %r13d, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-SSSE3-NEXT: shlq $61, %rbx +; SSE2-SSSE3-NEXT: sarq $63, %rbx +; SSE2-SSSE3-NEXT: movd %ebx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: shlq $62, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm5 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-SSSE3-NEXT: shlq $63, %rdx +; SSE2-SSSE3-NEXT: sarq $63, %rdx +; SSE2-SSSE3-NEXT: movd %edx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-SSSE3-NEXT: shlq $58, %rsi +; SSE2-SSSE3-NEXT: sarq $63, %rsi +; SSE2-SSSE3-NEXT: movd %esi, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-SSSE3-NEXT: shlq $59, %rdi +; SSE2-SSSE3-NEXT: sarq $63, %rdi +; SSE2-SSSE3-NEXT: movd %edi, %xmm4 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-SSSE3-NEXT: shlq $57, 
%rbp +; SSE2-SSSE3-NEXT: sarq $63, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm2 +; SSE2-SSSE3-NEXT: shrq $7, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: popq %rbx +; SSE2-SSSE3-NEXT: popq %r12 +; SSE2-SSSE3-NEXT: popq %r13 +; SSE2-SSSE3-NEXT: popq %r14 +; SSE2-SSSE3-NEXT: popq %r15 +; SSE2-SSSE3-NEXT: popq %rbp +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: ext_i16_16i8: +; AVX12: # BB#0: +; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movswq -{{[0-9]+}}(%rsp), %rax +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $62, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: movq %rax, %rdx +; AVX12-NEXT: shlq $63, %rdx +; AVX12-NEXT: sarq $63, %rdx +; AVX12-NEXT: vmovd %edx, %xmm0 +; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $61, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $60, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $59, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $58, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $57, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movsbq %al, %rcx +; AVX12-NEXT: shrq $7, %rcx +; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $55, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $54, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $53, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $52, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $51, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $50, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movq %rax, %rcx +; AVX12-NEXT: shlq $49, %rcx +; AVX12-NEXT: sarq $63, %rcx +; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: shrq $15, %rax +; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX12-NEXT: retq +; +; AVX512-LABEL: ext_i16_16i8: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: vpmovm2b %k0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast i16 %a0 to <16 x i1> + %2 = sext <16 x i1> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; 256-bit vectors +; + +define <4 x i64> @ext_i4_4i64(i4 %a0) { +; SSE2-SSSE3-LABEL: ext_i4_4i64: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: andb $15, %dil +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl 
%eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-SSSE3-NEXT: movd %eax, %xmm2 +; SSE2-SSSE3-NEXT: shrl %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE2-SSSE3-NEXT: psllq $63, %xmm0 +; SSE2-SSSE3-NEXT: psrad $31, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSE2-SSSE3-NEXT: psllq $63, %xmm1 +; SSE2-SSSE3-NEXT: psrad $31, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i4_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: andb $15, %dil +; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $60, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: shlq $63, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i4_4i64: +; AVX2: # BB#0: +; AVX2-NEXT: andb $15, %dil +; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $60, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: shlq $63, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i4_4i64: +; AVX512: # BB#0: +; AVX512-NEXT: andb $15, %dil +; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512-NEXT: retq + %1 = bitcast i4 %a0 to <4 x i1> + %2 = sext <4 x i1> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define <8 x i32> @ext_i8_8i32(i8 %a0) { +; SSE2-SSSE3-LABEL: ext_i8_8i32: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: shrl $7, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm3 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pslld $31, %xmm0 +; SSE2-SSSE3-NEXT: psrad $31, %xmm0 +; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: pslld $31, %xmm1 +; SSE2-SSSE3-NEXT: psrad $31, %xmm1 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i8_8i32: +; AVX1: # BB#0: +; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $58, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $59, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $57, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shrq $7, %rcx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $62, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: shlq $63, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vmovd %edx, %xmm1 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $60, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i8_8i32: +; AVX2: # BB#0: +; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $58, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: shlq $59, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $57, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq $7, %rcx +; 
AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $62, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: shlq $63, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vmovd %edx, %xmm1 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $60, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i8_8i32: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i8 %a0 to <8 x i1> + %2 = sext <8 x i1> %1 to <8 x i32> + ret <8 x i32> %2 +} + +define <16 x i16> @ext_i16_16i16(i16 %a0) { +; SSE2-SSSE3-LABEL: ext_i16_16i16: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-SSSE3-NEXT: psllw $15, %xmm0 +; SSE2-SSSE3-NEXT: psraw $15, %xmm0 +; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-SSSE3-NEXT: psllw $15, %xmm1 +; SSE2-SSSE3-NEXT: psraw $15, %xmm1 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i16_16i16: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi0: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: .Lcfi1: +; AVX1-NEXT: .cfi_def_cfa_offset 24 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: .Lcfi2: +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: .Lcfi3: +; AVX1-NEXT: .cfi_def_cfa_offset 40 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: .Lcfi4: +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: .Lcfi5: +; AVX1-NEXT: .cfi_def_cfa_offset 56 +; AVX1-NEXT: .Lcfi6: +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: .Lcfi7: +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: .Lcfi8: +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: .Lcfi9: +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: .Lcfi10: +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: .Lcfi11: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movswq -{{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: shlq $55, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: movq %rax, %r8 +; AVX1-NEXT: movq %rax, %r10 +; AVX1-NEXT: movq %rax, %r11 +; AVX1-NEXT: movq %rax, %r14 +; AVX1-NEXT: 
movq %rax, %r15 +; AVX1-NEXT: movq %rax, %r9 +; AVX1-NEXT: movq %rax, %r12 +; AVX1-NEXT: movq %rax, %r13 +; AVX1-NEXT: movq %rax, %rbx +; AVX1-NEXT: movq %rax, %rdi +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: movq %rax, %rdx +; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: movsbq %al, %rbp +; AVX1-NEXT: shlq $54, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: shlq $53, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $52, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $51, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $50, %r14 +; AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $49, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 +; AVX1-NEXT: shrq $15, %r9 +; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: shlq $63, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vmovd %r13d, %xmm1 +; AVX1-NEXT: shlq $62, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $61, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $60, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $59, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $58, %rdx +; AVX1-NEXT: sarq $63, %rdx +; AVX1-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $57, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 +; AVX1-NEXT: shrq $7, %rbp +; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i16_16i16: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .Lcfi0: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: .Lcfi1: +; AVX2-NEXT: .cfi_def_cfa_offset 24 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: .Lcfi2: +; AVX2-NEXT: .cfi_def_cfa_offset 32 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: .Lcfi3: +; AVX2-NEXT: .cfi_def_cfa_offset 40 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: .Lcfi4: +; AVX2-NEXT: .cfi_def_cfa_offset 48 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: .Lcfi5: +; AVX2-NEXT: .cfi_def_cfa_offset 56 +; AVX2-NEXT: .Lcfi6: +; AVX2-NEXT: .cfi_offset %rbx, -56 +; AVX2-NEXT: .Lcfi7: +; AVX2-NEXT: .cfi_offset %r12, -48 +; AVX2-NEXT: .Lcfi8: +; AVX2-NEXT: .cfi_offset %r13, -40 +; AVX2-NEXT: .Lcfi9: +; AVX2-NEXT: .cfi_offset %r14, -32 +; AVX2-NEXT: .Lcfi10: +; AVX2-NEXT: .cfi_offset %r15, -24 +; AVX2-NEXT: .Lcfi11: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movswq -{{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shlq $55, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: movq %rax, %r11 +; AVX2-NEXT: movq %rax, %r14 +; AVX2-NEXT: movq %rax, %r15 +; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movq %rax, %r12 +; AVX2-NEXT: movq %rax, %r13 +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq %rax, %rdx +; AVX2-NEXT: movq %rax, %rsi +; AVX2-NEXT: movsbq %al, %rbp +; AVX2-NEXT: shlq $54, %rax +; AVX2-NEXT: sarq 
$63, %rax +; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: shlq $53, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $52, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $51, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $50, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $49, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0 +; AVX2-NEXT: shrq $15, %r9 +; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: shlq $63, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vmovd %r13d, %xmm1 +; AVX2-NEXT: shlq $62, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $61, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $60, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $59, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $58, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $57, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1 +; AVX2-NEXT: shrq $7, %rbp +; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i16_16i16: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: vpmovm2w %k0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast i16 %a0 to <16 x i1> + %2 = sext <16 x i1> %1 to <16 x i16> + ret <16 x i16> %2 +} + +define <32 x i8> @ext_i32_32i8(i32 %a0) { +; SSE2-SSSE3-LABEL: ext_i32_32i8: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: pushq %rbp +; SSE2-SSSE3-NEXT: .Lcfi12: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16 +; SSE2-SSSE3-NEXT: pushq %r15 +; SSE2-SSSE3-NEXT: .Lcfi13: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24 +; SSE2-SSSE3-NEXT: pushq %r14 +; SSE2-SSSE3-NEXT: .Lcfi14: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32 +; SSE2-SSSE3-NEXT: pushq %r13 +; SSE2-SSSE3-NEXT: .Lcfi15: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40 +; SSE2-SSSE3-NEXT: pushq %r12 +; SSE2-SSSE3-NEXT: .Lcfi16: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48 +; SSE2-SSSE3-NEXT: pushq %rbx +; SSE2-SSSE3-NEXT: .Lcfi17: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56 +; SSE2-SSSE3-NEXT: .Lcfi18: +; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56 +; SSE2-SSSE3-NEXT: .Lcfi19: +; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48 +; SSE2-SSSE3-NEXT: .Lcfi20: +; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40 +; SSE2-SSSE3-NEXT: .Lcfi21: +; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32 +; SSE2-SSSE3-NEXT: .Lcfi22: +; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24 +; SSE2-SSSE3-NEXT: .Lcfi23: +; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16 +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: shrl $16, %edi +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rbx +; SSE2-SSSE3-NEXT: movq %rbx, %r8 +; SSE2-SSSE3-NEXT: movq %rbx, %r9 +; SSE2-SSSE3-NEXT: movq %rbx, %r10 +; SSE2-SSSE3-NEXT: movq %rbx, %r11 +; SSE2-SSSE3-NEXT: movq %rbx, %r14 +; SSE2-SSSE3-NEXT: movq %rbx, %r15 +; SSE2-SSSE3-NEXT: movq %rbx, %r12 +; SSE2-SSSE3-NEXT: movq %rbx, %r13 +; SSE2-SSSE3-NEXT: movq %rbx, %rdi +; SSE2-SSSE3-NEXT: 
movq %rbx, %rcx +; SSE2-SSSE3-NEXT: movq %rbx, %rdx +; SSE2-SSSE3-NEXT: movq %rbx, %rbp +; SSE2-SSSE3-NEXT: movq %rbx, %rsi +; SSE2-SSSE3-NEXT: movq %rbx, %rax +; SSE2-SSSE3-NEXT: shrq $15, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: movq %rbx, %rax +; SSE2-SSSE3-NEXT: movsbq %bl, %rbx +; SSE2-SSSE3-NEXT: shlq $49, %r8 +; SSE2-SSSE3-NEXT: sarq $63, %r8 +; SSE2-SSSE3-NEXT: movd %r8d, %xmm15 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-SSSE3-NEXT: shlq $50, %r9 +; SSE2-SSSE3-NEXT: sarq $63, %r9 +; SSE2-SSSE3-NEXT: movd %r9d, %xmm8 +; SSE2-SSSE3-NEXT: shlq $51, %r10 +; SSE2-SSSE3-NEXT: sarq $63, %r10 +; SSE2-SSSE3-NEXT: movd %r10d, %xmm3 +; SSE2-SSSE3-NEXT: shlq $52, %r11 +; SSE2-SSSE3-NEXT: sarq $63, %r11 +; SSE2-SSSE3-NEXT: movd %r11d, %xmm9 +; SSE2-SSSE3-NEXT: shlq $53, %r14 +; SSE2-SSSE3-NEXT: sarq $63, %r14 +; SSE2-SSSE3-NEXT: movd %r14d, %xmm6 +; SSE2-SSSE3-NEXT: shlq $54, %r15 +; SSE2-SSSE3-NEXT: sarq $63, %r15 +; SSE2-SSSE3-NEXT: movd %r15d, %xmm10 +; SSE2-SSSE3-NEXT: shlq $55, %r12 +; SSE2-SSSE3-NEXT: sarq $63, %r12 +; SSE2-SSSE3-NEXT: movd %r12d, %xmm1 +; SSE2-SSSE3-NEXT: shlq $60, %r13 +; SSE2-SSSE3-NEXT: sarq $63, %r13 +; SSE2-SSSE3-NEXT: movd %r13d, %xmm11 +; SSE2-SSSE3-NEXT: shlq $61, %rdi +; SSE2-SSSE3-NEXT: sarq $63, %rdi +; SSE2-SSSE3-NEXT: movd %edi, %xmm5 +; SSE2-SSSE3-NEXT: shlq $62, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm12 +; SSE2-SSSE3-NEXT: shlq $63, %rdx +; SSE2-SSSE3-NEXT: sarq $63, %rdx +; SSE2-SSSE3-NEXT: movd %edx, %xmm0 +; SSE2-SSSE3-NEXT: shlq $58, %rbp +; SSE2-SSSE3-NEXT: sarq $63, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm13 +; SSE2-SSSE3-NEXT: shlq $59, %rsi +; SSE2-SSSE3-NEXT: sarq $63, %rsi +; SSE2-SSSE3-NEXT: movd %esi, %xmm7 +; SSE2-SSSE3-NEXT: shlq $57, %rax +; SSE2-SSSE3-NEXT: sarq $63, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: shrq $7, %rbx +; SSE2-SSSE3-NEXT: movd %ebx, %xmm14 +; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi +; SSE2-SSSE3-NEXT: movq %rsi, %r8 +; SSE2-SSSE3-NEXT: movq %rsi, %r9 +; SSE2-SSSE3-NEXT: movq %rsi, %r10 +; SSE2-SSSE3-NEXT: movq %rsi, %r11 +; SSE2-SSSE3-NEXT: movq %rsi, %r14 +; SSE2-SSSE3-NEXT: movq %rsi, %r15 +; SSE2-SSSE3-NEXT: movq %rsi, %r12 +; SSE2-SSSE3-NEXT: movq %rsi, %r13 +; SSE2-SSSE3-NEXT: movq %rsi, %rbx +; SSE2-SSSE3-NEXT: movq %rsi, %rax +; SSE2-SSSE3-NEXT: movq %rsi, %rcx +; SSE2-SSSE3-NEXT: movq %rsi, %rdx +; SSE2-SSSE3-NEXT: movq %rsi, %rdi +; SSE2-SSSE3-NEXT: movq %rsi, %rbp +; SSE2-SSSE3-NEXT: shrq $15, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm2 +; SSE2-SSSE3-NEXT: movq %rsi, %rbp +; SSE2-SSSE3-NEXT: movsbq %sil, %rsi +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE2-SSSE3-NEXT: punpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-SSSE3-NEXT: shlq $49, %r8 +; SSE2-SSSE3-NEXT: sarq $63, %r8 +; SSE2-SSSE3-NEXT: movd %r8d, %xmm3 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-SSSE3-NEXT: shlq $50, %r9 +; SSE2-SSSE3-NEXT: sarq $63, %r9 +; SSE2-SSSE3-NEXT: movd %r9d, %xmm4 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-SSSE3-NEXT: shlq $51, %r10 +; SSE2-SSSE3-NEXT: sarq $63, %r10 +; SSE2-SSSE3-NEXT: movd %r10d, %xmm5 +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: shlq $52, %r11 +; SSE2-SSSE3-NEXT: sarq $63, %r11 +; SSE2-SSSE3-NEXT: movd %r11d, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-SSSE3-NEXT: shlq $53, %r14 +; SSE2-SSSE3-NEXT: sarq $63, %r14 +; SSE2-SSSE3-NEXT: movd %r14d, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-SSSE3-NEXT: shlq $54, %r15 +; SSE2-SSSE3-NEXT: sarq $63, %r15 +; SSE2-SSSE3-NEXT: movd %r15d, %xmm4 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE2-SSSE3-NEXT: shlq $55, %r12 +; SSE2-SSSE3-NEXT: sarq $63, %r12 +; SSE2-SSSE3-NEXT: movd %r12d, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: shlq $60, %r13 +; SSE2-SSSE3-NEXT: sarq $63, %r13 +; SSE2-SSSE3-NEXT: movd %r13d, %xmm6 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-SSSE3-NEXT: shlq $61, %rbx +; SSE2-SSSE3-NEXT: sarq $63, %rbx +; SSE2-SSSE3-NEXT: movd %ebx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-SSSE3-NEXT: shlq $62, %rax +; SSE2-SSSE3-NEXT: sarq $63, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm2 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-SSSE3-NEXT: shlq $63, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-SSSE3-NEXT: shlq $58, %rdx +; SSE2-SSSE3-NEXT: sarq $63, %rdx 
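+; Without AVX-512 mask registers, each mask bit is materialized individually:
+; a copy of the mask is shifted so that bit k lands in the sign bit (shlq by
+; 63-k), then sarq $63 smears it across the register, yielding 0 or -1 per
+; element; the movd/punpckl* chains then pack these results into xmm0/xmm1.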
+; SSE2-SSSE3-NEXT: movd %edx, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-SSSE3-NEXT: shlq $59, %rdi +; SSE2-SSSE3-NEXT: sarq $63, %rdi +; SSE2-SSSE3-NEXT: movd %edi, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-SSSE3-NEXT: shlq $57, %rbp +; SSE2-SSSE3-NEXT: sarq $63, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm4 +; SSE2-SSSE3-NEXT: shrq $7, %rsi +; SSE2-SSSE3-NEXT: movd %esi, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-SSSE3-NEXT: popq %rbx +; SSE2-SSSE3-NEXT: popq %r12 +; SSE2-SSSE3-NEXT: popq %r13 +; SSE2-SSSE3-NEXT: popq %r14 +; SSE2-SSSE3-NEXT: popq %r15 +; SSE2-SSSE3-NEXT: popq %rbp +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i32_32i8: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi12: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi13: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi14: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: .Lcfi15: +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: .Lcfi16: +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: .Lcfi17: +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: .Lcfi18: +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: .Lcfi19: +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: movl %edi, (%rsp) +; AVX1-NEXT: movslq (%rsp), %rdx +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: shlq $47, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: movq %rdx, %rdi +; AVX1-NEXT: movq %rdx, %r13 +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: movq %rdx, %r10 +; AVX1-NEXT: movq %rdx, %r11 +; AVX1-NEXT: movq %rdx, %r9 +; AVX1-NEXT: movq %rdx, %rbx +; AVX1-NEXT: movq %rdx, %r14 +; AVX1-NEXT: movq %rdx, %r15 +; AVX1-NEXT: movq %rdx, %r12 +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shlq $46, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX1-NEXT: shlq $45, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: shlq $44, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: shlq $43, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: shlq $42, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 +; 
AVX1-NEXT: movq %rdx, %rdi +; AVX1-NEXT: shlq $41, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r13 +; AVX1-NEXT: shlq $40, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: shlq $39, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r10 +; AVX1-NEXT: shlq $38, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: movsbq %dl, %rax +; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: shlq $37, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r9 +; AVX1-NEXT: shlq $36, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %rbx +; AVX1-NEXT: shlq $35, %r14 +; AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r14 +; AVX1-NEXT: shlq $34, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r15 +; AVX1-NEXT: shlq $33, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r12 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX1-NEXT: shrq $31, %rax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shlq $63, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vmovd %r8d, %xmm1 +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: movswq %dx, %rdx +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload +; AVX1-NEXT: shlq $62, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $60, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $59, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $58, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $57, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload +; AVX1-NEXT: shrq $7, %rcx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $55, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $54, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $53, %r14 +; AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $52, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $51, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $50, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: shlq $49, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1 +; AVX1-NEXT: shrq $15, %rdx +; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: leaq -40(%rbp), %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i32_32i8: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: 
.Lcfi12: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .Lcfi13: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: .Lcfi14: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: .Lcfi15: +; AVX2-NEXT: .cfi_offset %rbx, -56 +; AVX2-NEXT: .Lcfi16: +; AVX2-NEXT: .cfi_offset %r12, -48 +; AVX2-NEXT: .Lcfi17: +; AVX2-NEXT: .cfi_offset %r13, -40 +; AVX2-NEXT: .Lcfi18: +; AVX2-NEXT: .cfi_offset %r14, -32 +; AVX2-NEXT: .Lcfi19: +; AVX2-NEXT: .cfi_offset %r15, -24 +; AVX2-NEXT: movl %edi, (%rsp) +; AVX2-NEXT: movslq (%rsp), %rdx +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: shlq $47, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: movq %rdx, %r13 +; AVX2-NEXT: movq %rdx, %rsi +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: movq %rdx, %r11 +; AVX2-NEXT: movq %rdx, %r9 +; AVX2-NEXT: movq %rdx, %rbx +; AVX2-NEXT: movq %rdx, %r14 +; AVX2-NEXT: movq %rdx, %r15 +; AVX2-NEXT: movq %rdx, %r12 +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shlq $46, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX2-NEXT: shlq $45, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: shlq $44, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: shlq $43, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: shlq $42, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: shlq $41, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r13 +; AVX2-NEXT: shlq $40, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rsi +; AVX2-NEXT: shlq $39, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: shlq $38, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: movsbq %dl, %rax +; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: shlq $37, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r9 +; AVX2-NEXT: shlq $36, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rbx +; AVX2-NEXT: shlq $35, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r14 +; AVX2-NEXT: shlq $34, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r15 +; AVX2-NEXT: shlq $33, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r12 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX2-NEXT: shrq $31, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shlq $63, %r8 +; AVX2-NEXT: 
sarq $63, %r8 +; AVX2-NEXT: vmovd %r8d, %xmm1 +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: movswq %dx, %rdx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload +; AVX2-NEXT: shlq $62, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $60, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $59, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $58, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $57, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload +; AVX2-NEXT: shrq $7, %rcx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $55, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $54, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $53, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $52, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $51, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $50, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: shlq $49, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $15, %rdx +; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: leaq -40(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i32_32i8: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: vpmovm2b %k0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast i32 %a0 to <32 x i1> + %2 = sext <32 x i1> %1 to <32 x i8> + ret <32 x i8> %2 +} + +; +; 512-bit vectors +; + +define <8 x i64> @ext_i8_8i64(i8 %a0) { +; SSE2-SSSE3-LABEL: ext_i8_8i64: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: 
punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: shrl $7, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-SSSE3-NEXT: psllq $63, %xmm0 +; SSE2-SSSE3-NEXT: psrad $31, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] +; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSE2-SSSE3-NEXT: psllq $63, %xmm1 +; SSE2-SSSE3-NEXT: psrad $31, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] +; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] +; SSE2-SSSE3-NEXT: psllq $63, %xmm2 +; SSE2-SSSE3-NEXT: psrad $31, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] +; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] +; SSE2-SSSE3-NEXT: psllq $63, %xmm3 +; SSE2-SSSE3-NEXT: psrad $31, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i8_8i64: +; AVX1: # BB#0: +; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $3, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $4, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $5, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $6, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $7, %eax +; AVX1-NEXT: movzwl %ax, %eax +; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i8_8i64: +; AVX2: # BB#0: +; AVX2-NEXT: movb %dil, 
-{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $4, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $5, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $6, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $7, %eax +; AVX2-NEXT: movzwl %ax, %eax +; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i8_8i64: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i8 %a0 to <8 x i1> + %2 = sext <8 x i1> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define <16 x i32> @ext_i16_16i32(i16 %a0) { +; SSE2-SSSE3-LABEL: ext_i16_16i32: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; 
SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: pslld $31, %xmm0 +; SSE2-SSSE3-NEXT: psrad $31, %xmm0 +; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: pslld $31, %xmm1 +; SSE2-SSSE3-NEXT: psrad $31, %xmm1 +; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-SSSE3-NEXT: pslld $31, %xmm2 +; SSE2-SSSE3-NEXT: psrad $31, %xmm2 +; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-SSSE3-NEXT: pslld $31, %xmm3 +; SSE2-SSSE3-NEXT: psrad $31, %xmm3 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i16_16i32: +; AVX1: # BB#0: +; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $3, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $4, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $5, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $6, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $7, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $8, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $9, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $10, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $11, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $12, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $13, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $14, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $15, %eax +; AVX1-NEXT: movzwl %ax, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: ext_i16_16i32: +; AVX2: # BB#0: +; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $4, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $5, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $6, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $7, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $8, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $9, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $10, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $11, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $12, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $13, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $14, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $15, %eax +; AVX2-NEXT: movzwl %ax, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i16_16i32: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i16 %a0 to <16 x i1> + %2 = sext <16 x i1> %1 to <16 x i32> + ret <16 x i32> %2 +} + +define <32 x i16> @ext_i32_32i16(i32 %a0) { +; SSE2-SSSE3-LABEL: ext_i32_32i16: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movl %edi, %eax +; SSE2-SSSE3-NEXT: shrl $16, %eax +; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl 
$1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; 
SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-SSSE3-NEXT: psllw $15, %xmm0 +; SSE2-SSSE3-NEXT: psraw $15, %xmm0 +; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-SSSE3-NEXT: psllw $15, %xmm1 +; SSE2-SSSE3-NEXT: psraw $15, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: psllw $15, %xmm2 +; SSE2-SSSE3-NEXT: psraw $15, %xmm2 +; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-SSSE3-NEXT: psllw $15, %xmm3 +; SSE2-SSSE3-NEXT: psraw $15, %xmm3 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i32_32i16: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi20: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi21: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi22: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $128, %rsp +; AVX1-NEXT: .Lcfi23: +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: .Lcfi24: +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: .Lcfi25: +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: .Lcfi26: +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: .Lcfi27: +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) 
# 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, %r13d +; AVX1-NEXT: movl %edi, %r12d +; AVX1-NEXT: movl %edi, %r15d +; AVX1-NEXT: movl %edi, %r14d +; AVX1-NEXT: movl %edi, %ebx +; AVX1-NEXT: movl %edi, %r11d +; AVX1-NEXT: movl %edi, %r10d +; AVX1-NEXT: movl %edi, %r9d +; AVX1-NEXT: movl %edi, %r8d +; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: movl %edi, %edx +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $3, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $4, %esi +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0 +; AVX1-NEXT: shrl $5, %r8d +; AVX1-NEXT: andl $1, %r8d +; AVX1-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $6, %r9d +; AVX1-NEXT: andl $1, %r9d +; AVX1-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $7, %r10d +; AVX1-NEXT: andl $1, %r10d +; AVX1-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $8, %r11d +; AVX1-NEXT: andl $1, %r11d +; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $9, %ebx +; AVX1-NEXT: andl $1, %ebx +; AVX1-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $10, %r14d +; AVX1-NEXT: andl $1, %r14d +; AVX1-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $11, %r15d +; AVX1-NEXT: andl $1, %r15d +; AVX1-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $12, %r12d +; AVX1-NEXT: andl $1, %r12d +; AVX1-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $13, %r13d +; AVX1-NEXT: andl $1, %r13d +; AVX1-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $14, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $15, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $17, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $18, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $19, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $20, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; 
AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $21, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $22, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $23, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $24, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $25, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $26, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $27, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $28, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $29, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $30, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $31, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: leaq -40(%rbp), %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i32_32i16: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .Lcfi20: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .Lcfi21: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: .Lcfi22: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $128, %rsp +; AVX2-NEXT: .Lcfi23: +; AVX2-NEXT: .cfi_offset %rbx, -56 +; AVX2-NEXT: .Lcfi24: +; AVX2-NEXT: .cfi_offset %r12, -48 +; AVX2-NEXT: .Lcfi25: +; AVX2-NEXT: .cfi_offset %r13, -40 +; AVX2-NEXT: .Lcfi26: +; AVX2-NEXT: .cfi_offset %r14, -32 +; AVX2-NEXT: .Lcfi27: +; 
AVX2-NEXT: .cfi_offset %r15, -24 +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, %r13d +; AVX2-NEXT: movl %edi, %r12d +; AVX2-NEXT: movl %edi, %r15d +; AVX2-NEXT: movl %edi, %r14d +; AVX2-NEXT: movl %edi, %ebx +; AVX2-NEXT: movl %edi, %r11d +; AVX2-NEXT: movl %edi, %r10d +; AVX2-NEXT: movl %edi, %r9d +; AVX2-NEXT: movl %edi, %r8d +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: movl %edi, %edx +; AVX2-NEXT: movl %edi, %ecx +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $4, %esi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0 +; AVX2-NEXT: shrl $5, %r8d +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $6, %r9d +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $7, %r10d +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $8, %r11d +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $9, %ebx +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $10, %r14d +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $11, %r15d +; AVX2-NEXT: andl $1, %r15d +; AVX2-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $12, %r12d +; AVX2-NEXT: andl $1, %r12d +; AVX2-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $13, %r13d +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $14, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $15, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $17, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: 
shrl $18, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $19, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $20, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $21, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $22, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $23, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $24, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $25, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $26, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $27, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $28, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $29, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $30, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $31, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1 +; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX2-NEXT: leaq -40(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i32_32i16: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: vpmovm2w %k0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast i32 %a0 to <32 x i1> + %2 = sext <32 x i1> %1 to <32 x i16> + ret <32 x i16> %2 +} + +define <64 x i8> @ext_i64_64i8(i64 %a0) { +; SSE2-SSSE3-LABEL: ext_i64_64i8: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: pushq %rbp +; SSE2-SSSE3-NEXT: .Lcfi24: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16 +; SSE2-SSSE3-NEXT: pushq %r15 +; SSE2-SSSE3-NEXT: .Lcfi25: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 
24 +; SSE2-SSSE3-NEXT: pushq %r14 +; SSE2-SSSE3-NEXT: .Lcfi26: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32 +; SSE2-SSSE3-NEXT: pushq %r13 +; SSE2-SSSE3-NEXT: .Lcfi27: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40 +; SSE2-SSSE3-NEXT: pushq %r12 +; SSE2-SSSE3-NEXT: .Lcfi28: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48 +; SSE2-SSSE3-NEXT: pushq %rbx +; SSE2-SSSE3-NEXT: .Lcfi29: +; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56 +; SSE2-SSSE3-NEXT: .Lcfi30: +; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56 +; SSE2-SSSE3-NEXT: .Lcfi31: +; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48 +; SSE2-SSSE3-NEXT: .Lcfi32: +; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40 +; SSE2-SSSE3-NEXT: .Lcfi33: +; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32 +; SSE2-SSSE3-NEXT: .Lcfi34: +; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24 +; SSE2-SSSE3-NEXT: .Lcfi35: +; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16 +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movq %rdi, %rax +; SSE2-SSSE3-NEXT: shrq $32, %rax +; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movq %rdi, %rax +; SSE2-SSSE3-NEXT: shrq $48, %rax +; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: shrl $16, %edi +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rbx +; SSE2-SSSE3-NEXT: movq %rbx, %r8 +; SSE2-SSSE3-NEXT: movq %rbx, %r9 +; SSE2-SSSE3-NEXT: movq %rbx, %r10 +; SSE2-SSSE3-NEXT: movq %rbx, %r11 +; SSE2-SSSE3-NEXT: movq %rbx, %r14 +; SSE2-SSSE3-NEXT: movq %rbx, %r15 +; SSE2-SSSE3-NEXT: movq %rbx, %r12 +; SSE2-SSSE3-NEXT: movq %rbx, %r13 +; SSE2-SSSE3-NEXT: movq %rbx, %rdi +; SSE2-SSSE3-NEXT: movq %rbx, %rcx +; SSE2-SSSE3-NEXT: movq %rbx, %rdx +; SSE2-SSSE3-NEXT: movq %rbx, %rsi +; SSE2-SSSE3-NEXT: movq %rbx, %rbp +; SSE2-SSSE3-NEXT: movq %rbx, %rax +; SSE2-SSSE3-NEXT: shrq $15, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: movq %rbx, %rax +; SSE2-SSSE3-NEXT: movsbq %bl, %rbx +; SSE2-SSSE3-NEXT: shlq $49, %r8 +; SSE2-SSSE3-NEXT: sarq $63, %r8 +; SSE2-SSSE3-NEXT: movd %r8d, %xmm15 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-SSSE3-NEXT: shlq $50, %r9 +; SSE2-SSSE3-NEXT: sarq $63, %r9 +; SSE2-SSSE3-NEXT: movd %r9d, %xmm8 +; SSE2-SSSE3-NEXT: shlq $51, %r10 +; SSE2-SSSE3-NEXT: sarq $63, %r10 +; SSE2-SSSE3-NEXT: movd %r10d, %xmm2 +; SSE2-SSSE3-NEXT: shlq $52, %r11 +; SSE2-SSSE3-NEXT: sarq $63, %r11 +; SSE2-SSSE3-NEXT: movd %r11d, %xmm9 +; SSE2-SSSE3-NEXT: shlq $53, %r14 +; SSE2-SSSE3-NEXT: sarq $63, %r14 +; SSE2-SSSE3-NEXT: movd %r14d, %xmm6 +; SSE2-SSSE3-NEXT: shlq $54, %r15 +; SSE2-SSSE3-NEXT: sarq $63, %r15 +; SSE2-SSSE3-NEXT: movd %r15d, %xmm10 +; SSE2-SSSE3-NEXT: shlq $55, %r12 +; SSE2-SSSE3-NEXT: sarq $63, %r12 +; SSE2-SSSE3-NEXT: movd %r12d, %xmm4 +; SSE2-SSSE3-NEXT: shlq $60, %r13 +; SSE2-SSSE3-NEXT: sarq $63, %r13 +; SSE2-SSSE3-NEXT: movd %r13d, %xmm11 +; SSE2-SSSE3-NEXT: shlq $61, %rdi +; SSE2-SSSE3-NEXT: sarq $63, %rdi +; SSE2-SSSE3-NEXT: movd %edi, %xmm5 +; SSE2-SSSE3-NEXT: shlq $62, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm12 +; SSE2-SSSE3-NEXT: shlq $63, %rdx +; SSE2-SSSE3-NEXT: sarq $63, %rdx +; SSE2-SSSE3-NEXT: movd %edx, %xmm0 +; SSE2-SSSE3-NEXT: shlq $58, %rsi +; SSE2-SSSE3-NEXT: sarq $63, %rsi +; SSE2-SSSE3-NEXT: movd %esi, %xmm13 +; SSE2-SSSE3-NEXT: shlq $59, %rbp +; SSE2-SSSE3-NEXT: sarq $63, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm7 +; SSE2-SSSE3-NEXT: shlq $57, %rax +; SSE2-SSSE3-NEXT: sarq $63, 
%rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm3 +; SSE2-SSSE3-NEXT: shrq $7, %rbx +; SSE2-SSSE3-NEXT: movd %ebx, %xmm14 +; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi +; SSE2-SSSE3-NEXT: movq %rsi, %r8 +; SSE2-SSSE3-NEXT: movq %rsi, %r9 +; SSE2-SSSE3-NEXT: movq %rsi, %r10 +; SSE2-SSSE3-NEXT: movq %rsi, %r11 +; SSE2-SSSE3-NEXT: movq %rsi, %r14 +; SSE2-SSSE3-NEXT: movq %rsi, %r15 +; SSE2-SSSE3-NEXT: movq %rsi, %r12 +; SSE2-SSSE3-NEXT: movq %rsi, %r13 +; SSE2-SSSE3-NEXT: movq %rsi, %rbx +; SSE2-SSSE3-NEXT: movq %rsi, %rax +; SSE2-SSSE3-NEXT: movq %rsi, %rcx +; SSE2-SSSE3-NEXT: movq %rsi, %rdx +; SSE2-SSSE3-NEXT: movq %rsi, %rdi +; SSE2-SSSE3-NEXT: movq %rsi, %rbp +; SSE2-SSSE3-NEXT: shrq $15, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm1 +; SSE2-SSSE3-NEXT: movq %rsi, %rbp +; SSE2-SSSE3-NEXT: movsbq %sil, %rsi +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE2-SSSE3-NEXT: shlq $49, %r8 +; SSE2-SSSE3-NEXT: sarq $63, %r8 +; SSE2-SSSE3-NEXT: movd %r8d, %xmm13 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE2-SSSE3-NEXT: shlq $50, %r9 +; SSE2-SSSE3-NEXT: sarq $63, %r9 +; SSE2-SSSE3-NEXT: movd %r9d, %xmm1 +; SSE2-SSSE3-NEXT: shlq $51, %r10 +; SSE2-SSSE3-NEXT: sarq $63, %r10 +; SSE2-SSSE3-NEXT: movd %r10d, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-SSSE3-NEXT: shlq $52, %r11 +; SSE2-SSSE3-NEXT: sarq $63, %r11 +; SSE2-SSSE3-NEXT: movd %r11d, %xmm8 +; SSE2-SSSE3-NEXT: shlq $53, %r14 
+; SSE2-SSSE3-NEXT: sarq $63, %r14 +; SSE2-SSSE3-NEXT: movd %r14d, %xmm15 +; SSE2-SSSE3-NEXT: shlq $54, %r15 +; SSE2-SSSE3-NEXT: sarq $63, %r15 +; SSE2-SSSE3-NEXT: movd %r15d, %xmm9 +; SSE2-SSSE3-NEXT: shlq $55, %r12 +; SSE2-SSSE3-NEXT: sarq $63, %r12 +; SSE2-SSSE3-NEXT: movd %r12d, %xmm4 +; SSE2-SSSE3-NEXT: shlq $60, %r13 +; SSE2-SSSE3-NEXT: sarq $63, %r13 +; SSE2-SSSE3-NEXT: movd %r13d, %xmm10 +; SSE2-SSSE3-NEXT: shlq $61, %rbx +; SSE2-SSSE3-NEXT: sarq $63, %rbx +; SSE2-SSSE3-NEXT: movd %ebx, %xmm7 +; SSE2-SSSE3-NEXT: shlq $62, %rax +; SSE2-SSSE3-NEXT: sarq $63, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm11 +; SSE2-SSSE3-NEXT: shlq $63, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shlq $58, %rdx +; SSE2-SSSE3-NEXT: sarq $63, %rdx +; SSE2-SSSE3-NEXT: movd %edx, %xmm12 +; SSE2-SSSE3-NEXT: shlq $59, %rdi +; SSE2-SSSE3-NEXT: sarq $63, %rdi +; SSE2-SSSE3-NEXT: movd %edi, %xmm5 +; SSE2-SSSE3-NEXT: shlq $57, %rbp +; SSE2-SSSE3-NEXT: sarq $63, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm1 +; SSE2-SSSE3-NEXT: shrq $7, %rsi +; SSE2-SSSE3-NEXT: movd %esi, %xmm14 +; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi +; SSE2-SSSE3-NEXT: movq %rsi, %r8 +; SSE2-SSSE3-NEXT: movq %rsi, %r9 +; SSE2-SSSE3-NEXT: movq %rsi, %r10 +; SSE2-SSSE3-NEXT: movq %rsi, %r11 +; SSE2-SSSE3-NEXT: movq %rsi, %r14 +; SSE2-SSSE3-NEXT: movq %rsi, %r15 +; SSE2-SSSE3-NEXT: movq %rsi, %r12 +; SSE2-SSSE3-NEXT: movq %rsi, %r13 +; SSE2-SSSE3-NEXT: movq %rsi, %rbx +; SSE2-SSSE3-NEXT: movq %rsi, %rax +; SSE2-SSSE3-NEXT: movq %rsi, %rcx +; SSE2-SSSE3-NEXT: movq %rsi, %rdx +; SSE2-SSSE3-NEXT: movq %rsi, %rdi +; SSE2-SSSE3-NEXT: movq %rsi, %rbp +; SSE2-SSSE3-NEXT: shrq $15, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm6 +; SSE2-SSSE3-NEXT: movq %rsi, %rbp +; SSE2-SSSE3-NEXT: movsbq %sil, %rsi +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = 
xmm2[0],xmm4[0] +; SSE2-SSSE3-NEXT: shlq $49, %r8 +; SSE2-SSSE3-NEXT: sarq $63, %r8 +; SSE2-SSSE3-NEXT: movd %r8d, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE2-SSSE3-NEXT: shlq $50, %r9 +; SSE2-SSSE3-NEXT: sarq $63, %r9 +; SSE2-SSSE3-NEXT: movd %r9d, %xmm3 +; SSE2-SSSE3-NEXT: shlq $51, %r10 +; SSE2-SSSE3-NEXT: sarq $63, %r10 +; SSE2-SSSE3-NEXT: movd %r10d, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-SSSE3-NEXT: shlq $52, %r11 +; SSE2-SSSE3-NEXT: sarq $63, %r11 +; SSE2-SSSE3-NEXT: movd %r11d, %xmm8 +; SSE2-SSSE3-NEXT: shlq $53, %r14 +; SSE2-SSSE3-NEXT: sarq $63, %r14 +; SSE2-SSSE3-NEXT: movd %r14d, %xmm13 +; SSE2-SSSE3-NEXT: shlq $54, %r15 +; SSE2-SSSE3-NEXT: sarq $63, %r15 +; SSE2-SSSE3-NEXT: movd %r15d, %xmm9 +; SSE2-SSSE3-NEXT: shlq $55, %r12 +; SSE2-SSSE3-NEXT: sarq $63, %r12 +; SSE2-SSSE3-NEXT: movd %r12d, %xmm1 +; SSE2-SSSE3-NEXT: shlq $60, %r13 +; SSE2-SSSE3-NEXT: sarq $63, %r13 +; SSE2-SSSE3-NEXT: movd %r13d, %xmm10 +; SSE2-SSSE3-NEXT: shlq $61, %rbx +; SSE2-SSSE3-NEXT: sarq $63, %rbx +; SSE2-SSSE3-NEXT: movd %ebx, %xmm15 +; SSE2-SSSE3-NEXT: shlq $62, %rax +; SSE2-SSSE3-NEXT: sarq $63, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm11 +; SSE2-SSSE3-NEXT: shlq $63, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: shlq $58, %rdx +; SSE2-SSSE3-NEXT: sarq $63, %rdx +; SSE2-SSSE3-NEXT: movd %edx, %xmm12 +; SSE2-SSSE3-NEXT: shlq $59, %rdi +; SSE2-SSSE3-NEXT: sarq $63, %rdi +; SSE2-SSSE3-NEXT: movd %edi, %xmm5 +; SSE2-SSSE3-NEXT: shlq $57, %rbp +; SSE2-SSSE3-NEXT: sarq $63, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm6 +; SSE2-SSSE3-NEXT: shrq $7, %rsi +; SSE2-SSSE3-NEXT: movd %esi, %xmm14 +; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi +; SSE2-SSSE3-NEXT: movq %rsi, %r8 +; SSE2-SSSE3-NEXT: movq %rsi, %r9 +; SSE2-SSSE3-NEXT: movq %rsi, %r10 +; SSE2-SSSE3-NEXT: movq %rsi, %r11 +; SSE2-SSSE3-NEXT: movq %rsi, %r14 +; SSE2-SSSE3-NEXT: movq %rsi, %r15 +; SSE2-SSSE3-NEXT: movq %rsi, %r12 +; SSE2-SSSE3-NEXT: movq %rsi, %r13 +; SSE2-SSSE3-NEXT: movq %rsi, %rbx +; SSE2-SSSE3-NEXT: movq %rsi, %rax +; SSE2-SSSE3-NEXT: movq %rsi, %rcx +; SSE2-SSSE3-NEXT: movq %rsi, %rdx +; SSE2-SSSE3-NEXT: movq %rsi, %rdi +; SSE2-SSSE3-NEXT: movq %rsi, %rbp +; SSE2-SSSE3-NEXT: shrq $15, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm7 +; SSE2-SSSE3-NEXT: movq %rsi, %rbp +; SSE2-SSSE3-NEXT: movsbq %sil, %rsi +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; SSE2-SSSE3-NEXT: shlq $49, %r8 +; SSE2-SSSE3-NEXT: sarq $63, %r8 +; SSE2-SSSE3-NEXT: movd %r8d, %xmm4 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-SSSE3-NEXT: shlq $50, %r9 +; SSE2-SSSE3-NEXT: sarq $63, %r9 +; SSE2-SSSE3-NEXT: movd %r9d, %xmm6 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-SSSE3-NEXT: shlq $51, %r10 +; SSE2-SSSE3-NEXT: sarq $63, %r10 +; SSE2-SSSE3-NEXT: movd %r10d, %xmm5 +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE2-SSSE3-NEXT: shlq $52, %r11 +; SSE2-SSSE3-NEXT: sarq $63, %r11 +; SSE2-SSSE3-NEXT: movd %r11d, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE2-SSSE3-NEXT: shlq $53, %r14 +; SSE2-SSSE3-NEXT: sarq $63, %r14 +; SSE2-SSSE3-NEXT: movd %r14d, %xmm7 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-SSSE3-NEXT: shlq $54, %r15 +; SSE2-SSSE3-NEXT: sarq $63, %r15 +; SSE2-SSSE3-NEXT: movd %r15d, %xmm6 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-SSSE3-NEXT: shlq $55, %r12 +; SSE2-SSSE3-NEXT: sarq $63, %r12 +; SSE2-SSSE3-NEXT: movd %r12d, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; SSE2-SSSE3-NEXT: shlq $60, %r13 +; SSE2-SSSE3-NEXT: sarq $63, %r13 +; SSE2-SSSE3-NEXT: movd %r13d, %xmm8 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-SSSE3-NEXT: shlq $61, %rbx +; SSE2-SSSE3-NEXT: sarq $63, %rbx +; SSE2-SSSE3-NEXT: movd %ebx, %xmm6 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE2-SSSE3-NEXT: shlq $62, %rax +; SSE2-SSSE3-NEXT: sarq $63, %rax +; SSE2-SSSE3-NEXT: movd %eax, %xmm7 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-SSSE3-NEXT: shlq $63, %rcx +; SSE2-SSSE3-NEXT: sarq $63, %rcx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-SSSE3-NEXT: shlq $58, %rdx +; SSE2-SSSE3-NEXT: sarq $63, %rdx +; SSE2-SSSE3-NEXT: movd %edx, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-SSSE3-NEXT: shlq $59, %rdi +; SSE2-SSSE3-NEXT: sarq 
$63, %rdi +; SSE2-SSSE3-NEXT: movd %edi, %xmm7 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-SSSE3-NEXT: shlq $57, %rbp +; SSE2-SSSE3-NEXT: sarq $63, %rbp +; SSE2-SSSE3-NEXT: movd %ebp, %xmm5 +; SSE2-SSSE3-NEXT: shrq $7, %rsi +; SSE2-SSSE3-NEXT: movd %esi, %xmm6 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE2-SSSE3-NEXT: popq %rbx +; SSE2-SSSE3-NEXT: popq %r12 +; SSE2-SSSE3-NEXT: popq %r13 +; SSE2-SSSE3-NEXT: popq %r14 +; SSE2-SSSE3-NEXT: popq %r15 +; SSE2-SSSE3-NEXT: popq %rbp +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i64_64i8: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi28: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi29: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi30: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $128, %rsp +; AVX1-NEXT: .Lcfi31: +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: .Lcfi32: +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: .Lcfi33: +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: .Lcfi34: +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: .Lcfi35: +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $32, %rdi +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: shlq $47, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: movq %rdx, %rdi +; AVX1-NEXT: movq %rdx, %r13 +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: movq %rdx, %r10 +; AVX1-NEXT: movq %rdx, %r11 +; AVX1-NEXT: movq %rdx, %r9 +; AVX1-NEXT: movq %rdx, %rbx +; AVX1-NEXT: movq %rdx, %r14 +; AVX1-NEXT: movq %rdx, %r15 +; AVX1-NEXT: movq %rdx, %r12 +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shlq $46, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX1-NEXT: shlq $45, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: shlq $44, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: shlq $43, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: shlq $42, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %rdi +; AVX1-NEXT: shlq $41, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r13 +; AVX1-NEXT: shlq 
$40, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: shlq $39, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r10 +; AVX1-NEXT: shlq $38, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: movsbq %dl, %rax +; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: shlq $37, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r9 +; AVX1-NEXT: shlq $36, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %rbx +; AVX1-NEXT: shlq $35, %r14 +; AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r14 +; AVX1-NEXT: shlq $34, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r15 +; AVX1-NEXT: shlq $33, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %r12 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX1-NEXT: shrq $31, %rax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shlq $63, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vmovd %r8d, %xmm1 +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: movswq %dx, %rdx +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload +; AVX1-NEXT: shlq $62, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $61, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $60, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $59, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $58, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $57, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload +; AVX1-NEXT: shrq $7, %rcx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $55, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $54, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $53, %r14 +; AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $52, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $51, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $50, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: shlq $49, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1 +; AVX1-NEXT: shrq $15, %rdx +; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1 +; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rdx +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: shlq $47, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: movq %rdx, %r13 +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: movq %rdx, %r9 +; AVX1-NEXT: movq %rdx, %r12 +; AVX1-NEXT: movq %rdx, %rdi +; AVX1-NEXT: movq %rdx, %rbx +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: movq %rdx, %r10 +; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: movq %rdx, %r11 +; 
AVX1-NEXT: movq %rdx, %r14 +; AVX1-NEXT: movq %rdx, %r15 +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shlq $46, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: shlq $45, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpinsrb $2, %r13d, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %r13 +; AVX1-NEXT: shlq $44, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: shlq $43, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %r9 +; AVX1-NEXT: shlq $42, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrb $5, %r12d, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %r12 +; AVX1-NEXT: shlq $41, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %rdi +; AVX1-NEXT: shlq $40, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrb $7, %ebx, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %rbx +; AVX1-NEXT: shlq $39, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrb $8, %r8d, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %r8 +; AVX1-NEXT: shlq $38, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrb $9, %r10d, %xmm2, %xmm2 +; AVX1-NEXT: movsbq %dl, %rax +; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX1-NEXT: shlq $37, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %r10 +; AVX1-NEXT: shlq $36, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: shlq $35, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: vpinsrb $12, %r11d, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %r11 +; AVX1-NEXT: shlq $34, %r14 +; AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrb $13, %r14d, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %r14 +; AVX1-NEXT: shlq $33, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrb $14, %r15d, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %r15 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX1-NEXT: shrq $31, %rax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shlq $63, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vmovd %ecx, %xmm3 +; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: movswq %dx, %rdx +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: shlq $62, %r13 +; AVX1-NEXT: sarq $63, %r13 +; AVX1-NEXT: vpinsrb $1, %r13d, %xmm3, %xmm1 +; AVX1-NEXT: shlq $61, %r9 +; AVX1-NEXT: sarq $63, %r9 +; AVX1-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $60, %r12 +; AVX1-NEXT: sarq $63, %r12 +; AVX1-NEXT: vpinsrb $3, %r12d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $59, %rdi +; AVX1-NEXT: sarq $63, %rdi +; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $58, %rbx +; AVX1-NEXT: sarq $63, %rbx +; AVX1-NEXT: vpinsrb $5, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shlq $57, %r8 +; AVX1-NEXT: sarq $63, %r8 +; AVX1-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload +; AVX1-NEXT: shrq $7, %rdi +; AVX1-NEXT: vpinsrb $7, %edi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $55, %r10 +; AVX1-NEXT: sarq $63, %r10 +; AVX1-NEXT: vpinsrb $8, %r10d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $54, %rsi +; AVX1-NEXT: sarq $63, %rsi +; AVX1-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 +; AVX1-NEXT: shlq $53, %r11 +; AVX1-NEXT: sarq $63, %r11 +; AVX1-NEXT: vpinsrb $10, %r11d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $52, %r14 +; 
AVX1-NEXT: sarq $63, %r14 +; AVX1-NEXT: vpinsrb $11, %r14d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $51, %r15 +; AVX1-NEXT: sarq $63, %r15 +; AVX1-NEXT: vpinsrb $12, %r15d, %xmm1, %xmm1 +; AVX1-NEXT: shlq $50, %rax +; AVX1-NEXT: sarq $63, %rax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: shlq $49, %rcx +; AVX1-NEXT: sarq $63, %rcx +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shrq $15, %rdx +; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: leaq -40(%rbp), %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i64_64i8: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .Lcfi28: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .Lcfi29: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: .Lcfi30: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $128, %rsp +; AVX2-NEXT: .Lcfi31: +; AVX2-NEXT: .cfi_offset %rbx, -56 +; AVX2-NEXT: .Lcfi32: +; AVX2-NEXT: .cfi_offset %r12, -48 +; AVX2-NEXT: .Lcfi33: +; AVX2-NEXT: .cfi_offset %r13, -40 +; AVX2-NEXT: .Lcfi34: +; AVX2-NEXT: .cfi_offset %r14, -32 +; AVX2-NEXT: .Lcfi35: +; AVX2-NEXT: .cfi_offset %r15, -24 +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $32, %rdi +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: shlq $47, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: movq %rdx, %r13 +; AVX2-NEXT: movq %rdx, %rsi +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: movq %rdx, %r11 +; AVX2-NEXT: movq %rdx, %r9 +; AVX2-NEXT: movq %rdx, %rbx +; AVX2-NEXT: movq %rdx, %r14 +; AVX2-NEXT: movq %rdx, %r15 +; AVX2-NEXT: movq %rdx, %r12 +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shlq $46, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX2-NEXT: shlq $45, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: shlq $44, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: shlq $43, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: shlq $42, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: shlq $41, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r13 +; AVX2-NEXT: shlq $40, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rsi +; AVX2-NEXT: shlq $39, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: shlq $38, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: movsbq %dl, %rax +; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 
8-byte Spill +; AVX2-NEXT: shlq $37, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r9 +; AVX2-NEXT: shlq $36, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rbx +; AVX2-NEXT: shlq $35, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r14 +; AVX2-NEXT: shlq $34, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r15 +; AVX2-NEXT: shlq $33, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %r12 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX2-NEXT: shrq $31, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shlq $63, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vmovd %r8d, %xmm1 +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: movswq %dx, %rdx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload +; AVX2-NEXT: shlq $62, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $61, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $60, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $59, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $58, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $57, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload +; AVX2-NEXT: shrq $7, %rcx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $55, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $54, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $53, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $52, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $51, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $50, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: shlq $49, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1 +; AVX2-NEXT: shrq $15, %rdx +; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1 +; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: shlq $47, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: movq %rdx, %r13 +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: movq %rdx, %r9 +; AVX2-NEXT: movq %rdx, %r12 +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: movq %rdx, %rbx +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: movq %rdx, %rsi +; AVX2-NEXT: movq %rdx, %r11 +; AVX2-NEXT: movq %rdx, %r14 +; AVX2-NEXT: movq %rdx, %r15 +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shlq $46, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: shlq $45, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpinsrb $2, %r13d, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %r13 +; AVX2-NEXT: shlq $44, %rcx +; AVX2-NEXT: sarq 
$63, %rcx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: shlq $43, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %r9 +; AVX2-NEXT: shlq $42, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrb $5, %r12d, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %r12 +; AVX2-NEXT: shlq $41, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: shlq $40, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrb $7, %ebx, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %rbx +; AVX2-NEXT: shlq $39, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrb $8, %r8d, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %r8 +; AVX2-NEXT: shlq $38, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrb $9, %r10d, %xmm2, %xmm2 +; AVX2-NEXT: movsbq %dl, %rax +; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX2-NEXT: shlq $37, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %r10 +; AVX2-NEXT: shlq $36, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %rsi +; AVX2-NEXT: shlq $35, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: vpinsrb $12, %r11d, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %r11 +; AVX2-NEXT: shlq $34, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrb $13, %r14d, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %r14 +; AVX2-NEXT: shlq $33, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrb $14, %r15d, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %r15 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; AVX2-NEXT: shrq $31, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shlq $63, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vmovd %ecx, %xmm3 +; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: movswq %dx, %rdx +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: shlq $62, %r13 +; AVX2-NEXT: sarq $63, %r13 +; AVX2-NEXT: vpinsrb $1, %r13d, %xmm3, %xmm1 +; AVX2-NEXT: shlq $61, %r9 +; AVX2-NEXT: sarq $63, %r9 +; AVX2-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $60, %r12 +; AVX2-NEXT: sarq $63, %r12 +; AVX2-NEXT: vpinsrb $3, %r12d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $59, %rdi +; AVX2-NEXT: sarq $63, %rdi +; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $58, %rbx +; AVX2-NEXT: sarq $63, %rbx +; AVX2-NEXT: vpinsrb $5, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shlq $57, %r8 +; AVX2-NEXT: sarq $63, %r8 +; AVX2-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload +; AVX2-NEXT: shrq $7, %rdi +; AVX2-NEXT: vpinsrb $7, %edi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $55, %r10 +; AVX2-NEXT: sarq $63, %r10 +; AVX2-NEXT: vpinsrb $8, %r10d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $54, %rsi +; AVX2-NEXT: sarq $63, %rsi +; AVX2-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1 +; AVX2-NEXT: shlq $53, %r11 +; AVX2-NEXT: sarq $63, %r11 +; AVX2-NEXT: vpinsrb $10, %r11d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $52, %r14 +; AVX2-NEXT: sarq $63, %r14 +; AVX2-NEXT: vpinsrb $11, %r14d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $51, %r15 +; AVX2-NEXT: sarq $63, %r15 +; AVX2-NEXT: vpinsrb $12, %r15d, %xmm1, %xmm1 +; AVX2-NEXT: shlq $50, %rax +; AVX2-NEXT: sarq $63, %rax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: shlq $49, %rcx +; AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shrq $15, %rdx +; AVX2-NEXT: 
vpinsrb $15, %edx, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: leaq -40(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i64_64i8: +; AVX512: # BB#0: +; AVX512-NEXT: kmovq %rdi, %k0 +; AVX512-NEXT: vpmovm2b %k0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast i64 %a0 to <64 x i1> + %2 = sext <64 x i1> %1 to <64 x i8> + ret <64 x i8> %2 +} diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll new file mode 100644 index 0000000000000..aa9e60df14044 --- /dev/null +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -0,0 +1,3279 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 + +; +; 128-bit vectors +; + +define <2 x i64> @ext_i2_2i64(i2 %a0) { +; SSE2-SSSE3-LABEL: ext_i2_2i64: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: andb $3, %dil +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movq %rcx, %xmm0 +; SSE2-SSSE3-NEXT: shrl %eax +; SSE2-SSSE3-NEXT: andl $1, %eax +; SSE2-SSSE3-NEXT: movq %rax, %xmm1 +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: ext_i2_2i64: +; AVX12: # BB#0: +; AVX12-NEXT: andb $3, %dil +; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vmovq %rcx, %xmm0 +; AVX12-NEXT: shrl %eax +; AVX12-NEXT: andl $1, %eax +; AVX12-NEXT: vmovq %rax, %xmm1 +; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX12-NEXT: retq +; +; AVX512-LABEL: ext_i2_2i64: +; AVX512: # BB#0: +; AVX512-NEXT: andb $3, %dil +; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = bitcast i2 %a0 to <2 x i1> + %2 = zext <2 x i1> %1 to <2 x i64> + ret <2 x i64> %2 +} + +define <4 x i32> @ext_i4_4i32(i4 %a0) { +; SSE2-SSSE3-LABEL: ext_i4_4i32: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: andb $15, %dil +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: shrl %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm2 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i4_4i32: +; AVX1: # BB#0: +; AVX1-NEXT: andb $15, %dil +; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl %ecx +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $3, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i4_4i32: +; AVX2: # BB#0: +; AVX2-NEXT: andb $15, %dil +; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl %ecx +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i4_4i32: +; AVX512: # BB#0: +; AVX512-NEXT: andb $15, %dil +; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = bitcast i4 %a0 to <4 x i1> + %2 = zext <4 x i1> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define <8 x i16> @ext_i8_8i16(i8 %a0) { +; SSE2-SSSE3-LABEL: ext_i8_8i16: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: shrl $7, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm3 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: ext_i8_8i16: +; AVX12: # BB#0: +; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: movl %eax, %edx +; AVX12-NEXT: andl $1, %edx +; AVX12-NEXT: vmovd %edx, %xmm0 +; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $2, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $3, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $4, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $5, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $6, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: shrl $7, %eax +; AVX12-NEXT: movzwl %ax, %eax +; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX12-NEXT: retq +; +; AVX512-LABEL: ext_i8_8i16: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k5 +; AVX512-NEXT: kshiftlw $8, %k5, %k0 +; AVX512-NEXT: kshiftrw $15, %k0, %k0 +; AVX512-NEXT: kshiftlw $9, %k5, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kshiftlw $10, %k5, %k2 +; AVX512-NEXT: kshiftrw $15, %k2, %k2 +; AVX512-NEXT: kshiftlw $11, %k5, %k3 +; AVX512-NEXT: kshiftrw $15, %k3, %k3 +; AVX512-NEXT: kshiftlw $12, %k5, %k4 +; AVX512-NEXT: kshiftrw $15, %k4, %k4 +; AVX512-NEXT: kshiftlw $13, %k5, %k6 +; AVX512-NEXT: kshiftrw $15, %k6, %k6 +; AVX512-NEXT: kshiftlw $15, %k5, %k7 +; AVX512-NEXT: kshiftrw $15, %k7, %k7 +; AVX512-NEXT: kshiftlw $14, %k5, %k5 +; AVX512-NEXT: kshiftrw $15, %k5, %k5 +; AVX512-NEXT: kmovd %k5, %eax +; AVX512-NEXT: andl $1, %eax +; AVX512-NEXT: kmovd %k7, %ecx +; AVX512-NEXT: andl $1, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX512-NEXT: kmovd %k6, %eax +; AVX512-NEXT: andl $1, %eax +; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX512-NEXT: kmovd %k4, %eax +; AVX512-NEXT: andl $1, %eax +; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX512-NEXT: kmovd %k3, %eax +; AVX512-NEXT: andl $1, %eax +; AVX512-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; AVX512-NEXT: kmovd %k2, %eax +; AVX512-NEXT: andl $1, %eax +; AVX512-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 +; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: andl $1, %eax +; AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: andl $1, %eax +; AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast i8 %a0 to <8 x i1> + %2 = zext <8 x i1> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define <16 x i8> @ext_i16_16i8(i16 %a0) { +; SSE2-SSSE3-LABEL: ext_i16_16i8: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: ext_i16_16i8: +; AVX12: # BB#0: +; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: movl %eax, %edx +; AVX12-NEXT: andl $1, %edx +; AVX12-NEXT: vmovd %edx, %xmm0 +; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $2, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $3, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $4, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $5, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $6, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $7, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $8, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $9, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $10, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $11, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $12, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $13, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $14, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: shrl $15, %eax +; AVX12-NEXT: movzwl %ax, %eax +; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX12-NEXT: retq +; +; AVX512-LABEL: ext_i16_16i8: +; AVX512: # BB#0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: .Lcfi0: +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: .Lcfi1: +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .Lcfi2: +; AVX512-NEXT: .cfi_def_cfa_offset 32 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: .Lcfi3: +; AVX512-NEXT: .cfi_def_cfa_offset 40 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: .Lcfi4: +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .Lcfi5: +; AVX512-NEXT: .cfi_def_cfa_offset 56 +; AVX512-NEXT: .Lcfi6: +; AVX512-NEXT: .cfi_offset %rbx, -56 +; AVX512-NEXT: .Lcfi7: +; AVX512-NEXT: .cfi_offset %r12, -48 +; AVX512-NEXT: .Lcfi8: +; AVX512-NEXT: .cfi_offset %r13, -40 +; AVX512-NEXT: .Lcfi9: +; AVX512-NEXT: .cfi_offset %r14, -32 +; AVX512-NEXT: .Lcfi10: +; AVX512-NEXT: 
.cfi_offset %r15, -24 +; AVX512-NEXT: .Lcfi11: +; AVX512-NEXT: .cfi_offset %rbp, -16 +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: kshiftlw $14, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %r8d +; AVX512-NEXT: kshiftlw $15, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %r9d +; AVX512-NEXT: kshiftlw $13, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %r10d +; AVX512-NEXT: kshiftlw $12, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %r11d +; AVX512-NEXT: kshiftlw $11, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %r14d +; AVX512-NEXT: kshiftlw $10, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %r15d +; AVX512-NEXT: kshiftlw $9, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %r12d +; AVX512-NEXT: kshiftlw $8, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %r13d +; AVX512-NEXT: kshiftlw $7, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %esi +; AVX512-NEXT: kshiftlw $6, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %ebx +; AVX512-NEXT: kshiftlw $5, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %ebp +; AVX512-NEXT: kshiftlw $4, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %edi +; AVX512-NEXT: kshiftlw $3, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: kshiftlw $2, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %ecx +; AVX512-NEXT: kshiftlw $1, %k0, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %k1, %edx +; AVX512-NEXT: kshiftrw $15, %k0, %k0 +; AVX512-NEXT: vmovd %r9d, %xmm0 +; AVX512-NEXT: kmovd %k0, %r9d +; AVX512-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0 +; AVX512-NEXT: vpinsrb $15, %r9d, %xmm0, %xmm0 +; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq + %1 = bitcast i16 %a0 to <16 x i1> + %2 = zext <16 x i1> %1 to <16 x i8> + ret <16 x i8> %2 +} + +; +; 256-bit vectors +; + +define <4 x i64> @ext_i4_4i64(i4 %a0) { +; SSE2-SSSE3-LABEL: ext_i4_4i64: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: andb $15, %dil +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-SSSE3-NEXT: movd %eax, %xmm2 +; SSE2-SSSE3-NEXT: shrl %eax +; SSE2-SSSE3-NEXT: movd 
%eax, %xmm0 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,1] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i4_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: andb $15, %dil +; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $3, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i4_4i64: +; AVX2: # BB#0: +; AVX2-NEXT: andb $15, %dil +; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i4_4i64: +; AVX512: # BB#0: +; AVX512-NEXT: andb $15, %dil +; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; AVX512-NEXT: retq + %1 = bitcast i4 %a0 to <4 x i1> + %2 = zext <4 x i1> %1 to <4 x i64> + ret <4 x i64> %2 +} + +define <8 x i32> @ext_i8_8i32(i8 %a0) { +; SSE2-SSSE3-LABEL: ext_i8_8i32: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; 
SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: shrl $7, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm3 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i8_8i32: +; AVX1: # BB#0: +; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $5, %ecx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: shrl $4, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $6, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $7, %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl %ecx +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shrl $3, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i8_8i32: +; AVX2: # BB#0: +; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $5, %ecx +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: shrl $4, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $6, %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $7, %ecx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl %ecx +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i8_8i32: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i8 %a0 to <8 x i1> + %2 = zext <8 x i1> %1 to <8 x i32> + ret <8 x i32> %2 +} + +define <16 x i16> @ext_i16_16i16(i16 %a0) { +; SSE2-SSSE3-LABEL: ext_i16_16i16: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: 
movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1] +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i16_16i16: +; AVX1: # BB#0: +; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $9, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: shrl $8, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $10, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $11, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $12, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $13, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $14, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $15, %ecx +; AVX1-NEXT: movzwl %cx, %ecx +; AVX1-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: vmovd %edx, %xmm1 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $3, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $4, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $5, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $6, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: shrl $7, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i16_16i16: +; AVX2: # BB#0: +; 
AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $9, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: shrl $8, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $10, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $11, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $12, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $13, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $14, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $15, %ecx +; AVX2-NEXT: movzwl %cx, %ecx +; AVX2-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: vmovd %edx, %xmm1 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $4, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $5, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $6, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: shrl $7, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i16_16i16: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i16 %a0 to <16 x i1> + %2 = zext <16 x i1> %1 to <16 x i16> + ret <16 x i16> %2 +} + +define <32 x i8> @ext_i32_32i8(i32 %a0) { +; SSE2-SSSE3-LABEL: ext_i32_32i8: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: shrl $16, %edi +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; 
SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-SSSE3-NEXT: movl 
%eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i32_32i8: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi0: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi1: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi2: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $17, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $18, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $19, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $20, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $21, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $22, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $23, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $24, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $25, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $26, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $27, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $28, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $29, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $30, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $31, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $2, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $3, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb 
$3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $6, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $7, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $8, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $9, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $10, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $11, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $12, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $13, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $14, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: shrl $15, %edi +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i32_32i8: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .Lcfi0: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .Lcfi1: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: .Lcfi2: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $17, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: movl %edi, %ecx +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $18, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $19, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $20, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $21, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $22, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $23, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $24, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $25, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $26, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $27, %eax +; 
AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $28, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $29, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $30, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $31, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: movl %edi, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $4, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $5, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $6, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $7, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $9, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $10, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $11, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $12, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $13, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $14, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: shrl $15, %edi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i32_32i8: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i32 %a0 to <32 x i1> + %2 = zext <32 x i1> %1 to <32 x i8> + ret <32 x i8> %2 +} + +; +; 512-bit vectors +; + +define <8 x i64> @ext_i8_8i64(i8 %a0) { +; SSE2-SSSE3-LABEL: ext_i8_8i64: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: shrl $7, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] +; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] +; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] +; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i8_8i64: +; AVX1: # BB#0: +; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $3, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $4, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $5, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $6, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $7, %eax +; AVX1-NEXT: movzwl %ax, %eax +; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1] +; AVX1-NEXT: vandps %ymm2, 
%ymm0, %ymm0 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i8_8i64: +; AVX2: # BB#0: +; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $4, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $5, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $6, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $7, %eax +; AVX2-NEXT: movzwl %ax, %eax +; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i8_8i64: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i8 %a0 to <8 x i1> + %2 = zext <8 x i1> %1 to <8 x i64> + ret <8 x i64> %2 +} + +define <16 x i32> @ext_i16_16i32(i16 %a0) { +; SSE2-SSSE3-LABEL: ext_i16_16i32: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; 
SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] +; SSE2-SSSE3-NEXT: pand %xmm4, 
%xmm0 +; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i16_16i32: +; AVX1: # BB#0: +; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: movl %eax, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $3, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $4, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $5, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $6, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $7, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $8, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $9, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $10, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $11, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $12, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $13, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $14, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $15, %eax +; AVX1-NEXT: movzwl %ax, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 
= xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i16_16i32: +; AVX2: # BB#0: +; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: movl %eax, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: vmovd %edx, %xmm0 +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $3, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $4, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $5, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $6, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $7, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $8, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $9, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $10, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $11, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $12, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $13, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $14, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $15, %eax +; AVX2-NEXT: movzwl %ax, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i16_16i32: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i16 %a0 to <16 x i1> + %2 = zext <16 x i1> %1 to <16 x i32> + ret <16 x i32> %2 +} + +define <32 x i16> @ext_i32_32i16(i32 %a0) { +; SSE2-SSSE3-LABEL: ext_i32_32i16: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movl %edi, %eax +; SSE2-SSSE3-NEXT: shrl $16, %eax +; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movw %di, 
-{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; 
SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i32_32i16: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi3: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi4: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi5: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $128, %rsp +; AVX1-NEXT: .Lcfi6: +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: .Lcfi7: +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: .Lcfi8: +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: .Lcfi9: +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: .Lcfi10: +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; 
AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX1-NEXT: movl %edi, %r13d +; AVX1-NEXT: movl %edi, %r12d +; AVX1-NEXT: movl %edi, %r15d +; AVX1-NEXT: movl %edi, %r14d +; AVX1-NEXT: movl %edi, %ebx +; AVX1-NEXT: movl %edi, %r11d +; AVX1-NEXT: movl %edi, %r10d +; AVX1-NEXT: movl %edi, %r9d +; AVX1-NEXT: movl %edi, %r8d +; AVX1-NEXT: movl %edi, %esi +; AVX1-NEXT: movl %edi, %edx +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $3, %edx +; AVX1-NEXT: andl $1, %edx +; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $4, %esi +; AVX1-NEXT: andl $1, %esi +; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0 +; AVX1-NEXT: shrl $5, %r8d +; AVX1-NEXT: andl $1, %r8d +; AVX1-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $6, %r9d +; AVX1-NEXT: andl $1, %r9d +; AVX1-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $7, %r10d +; AVX1-NEXT: andl $1, %r10d +; AVX1-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $8, %r11d +; AVX1-NEXT: andl $1, %r11d +; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $9, %ebx +; AVX1-NEXT: andl $1, %ebx +; AVX1-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $10, %r14d +; AVX1-NEXT: andl $1, %r14d +; AVX1-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $11, %r15d +; AVX1-NEXT: andl $1, %r15d +; AVX1-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $12, %r12d +; AVX1-NEXT: andl $1, %r12d +; AVX1-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0 +; AVX1-NEXT: shrl $13, %r13d +; AVX1-NEXT: andl $1, %r13d +; AVX1-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $14, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $15, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $16, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $17, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $18, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $19, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl 
$20, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $21, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $22, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $23, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $24, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $25, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $26, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $27, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $28, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $29, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $30, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX1-NEXT: shrl $31, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: leaq -40(%rbp), %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i32_32i16: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .Lcfi3: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .Lcfi4: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: .Lcfi5: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $128, %rsp +; AVX2-NEXT: .Lcfi6: +; AVX2-NEXT: .cfi_offset %rbx, -56 +; AVX2-NEXT: .Lcfi7: +; AVX2-NEXT: .cfi_offset %r12, -48 +; AVX2-NEXT: .Lcfi8: +; AVX2-NEXT: .cfi_offset %r13, -40 +; AVX2-NEXT: .Lcfi9: +; AVX2-NEXT: .cfi_offset %r14, -32 +; AVX2-NEXT: .Lcfi10: +; AVX2-NEXT: .cfi_offset %r15, -24 +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte 
Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill +; AVX2-NEXT: movl %edi, %r13d +; AVX2-NEXT: movl %edi, %r12d +; AVX2-NEXT: movl %edi, %r15d +; AVX2-NEXT: movl %edi, %r14d +; AVX2-NEXT: movl %edi, %ebx +; AVX2-NEXT: movl %edi, %r11d +; AVX2-NEXT: movl %edi, %r10d +; AVX2-NEXT: movl %edi, %r9d +; AVX2-NEXT: movl %edi, %r8d +; AVX2-NEXT: movl %edi, %esi +; AVX2-NEXT: movl %edi, %edx +; AVX2-NEXT: movl %edi, %ecx +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: shrl $2, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $3, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $4, %esi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0 +; AVX2-NEXT: shrl $5, %r8d +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $6, %r9d +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $7, %r10d +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $8, %r11d +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $9, %ebx +; AVX2-NEXT: andl $1, %ebx +; AVX2-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $10, %r14d +; AVX2-NEXT: andl $1, %r14d +; AVX2-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $11, %r15d +; AVX2-NEXT: andl $1, %r15d +; AVX2-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $12, %r12d +; AVX2-NEXT: andl $1, %r12d +; AVX2-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0 +; AVX2-NEXT: shrl $13, %r13d +; AVX2-NEXT: andl $1, %r13d +; AVX2-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $14, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $15, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $16, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $17, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $18, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $19, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $20, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $21, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $22, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $23, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $24, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $25, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $26, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $27, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $28, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $29, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $30, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload +; AVX2-NEXT: shrl $31, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: leaq -40(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 +; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i32_32i16: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i32 %a0 to <32 x i1> + %2 = zext <32 x i1> %1 to <32 x i16> + ret <32 x i16> %2 +} + +define <64 x i8> @ext_i64_64i8(i64 %a0) { +; SSE2-SSSE3-LABEL: ext_i64_64i8: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movq %rdi, %rax +; SSE2-SSSE3-NEXT: shrq $32, %rax +; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movq %rdi, %rax +; SSE2-SSSE3-NEXT: shrq $48, %rax +; SSE2-SSSE3-NEXT: movw %ax, 
-{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: shrl $16, %edi +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; 
SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; 
SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; 
SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm5 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm6 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, 
%xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm6 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm5 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm6 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm5 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm6 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm5 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm7 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: ext_i64_64i8: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi11: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi12: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi13: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $17, %eax +; AVX1-NEXT: andl $1, %eax +; 
AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $18, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $19, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $20, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $21, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $22, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $23, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $24, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $25, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $26, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $27, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $28, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $29, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $30, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $31, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $2, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $3, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $6, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $7, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $8, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $9, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $10, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $11, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; 
AVX1-NEXT: shrl $12, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $13, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $14, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $15, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $49, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movq %rdi, %rcx +; AVX1-NEXT: shrq $48, %rcx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $50, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $51, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $52, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $53, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $54, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $55, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $56, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $57, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $58, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $59, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $60, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $61, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $62, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $33, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movq %rdi, %rcx +; AVX1-NEXT: shrq $32, %rcx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $34, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $35, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $36, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $37, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $38, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: 
movq %rdi, %rax +; AVX1-NEXT: shrq $39, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $40, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $41, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $42, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $43, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $44, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $45, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq $46, %rax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX1-NEXT: shrq $47, %rdi +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: ext_i64_64i8: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .Lcfi11: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .Lcfi12: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: .Lcfi13: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $17, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: movl %edi, %ecx +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $18, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $19, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $20, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $21, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $22, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $23, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $24, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $25, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $26, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $27, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $28, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $29, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $30, %eax +; AVX2-NEXT: andl $1, %eax +; 
AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $31, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: movl %edi, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $4, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $5, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $6, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $7, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $9, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $10, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $11, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $12, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $13, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $14, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $15, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $49, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: movq %rdi, %rcx +; AVX2-NEXT: shrq $48, %rcx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $50, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $51, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $52, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $53, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $54, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $55, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $56, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $57, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, 
%xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $58, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $59, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $60, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $61, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $62, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $33, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: movq %rdi, %rcx +; AVX2-NEXT: shrq $32, %rcx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $34, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $35, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $36, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $37, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $38, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $39, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $40, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $41, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $42, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $43, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $44, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $45, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $46, %rax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: shrq $47, %rdi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: ext_i64_64i8: +; AVX512: # BB#0: +; AVX512-NEXT: kmovq %rdi, %k1 +; AVX512-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} +; AVX512-NEXT: retq + %1 = bitcast i64 %a0 to <64 x i1> + %2 = zext <64 x i1> %1 to <64 x i8> + ret <64 x i8> %2 +} diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll new file mode 100644 index 0000000000000..a190e05755228 --- /dev/null +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -0,0 +1,685 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 + +define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) { +; SSE2-SSSE3-LABEL: bitcast_i2_2i1: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movq %rcx, %xmm0 +; SSE2-SSSE3-NEXT: shrl %eax +; SSE2-SSSE3-NEXT: andl $1, %eax +; SSE2-SSSE3-NEXT: movq %rax, %xmm1 +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: bitcast_i2_2i1: +; AVX12: # BB#0: +; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vmovq %rcx, %xmm0 +; AVX12-NEXT: shrl %eax +; AVX12-NEXT: andl $1, %eax +; AVX12-NEXT: vmovq %rax, %xmm1 +; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX12-NEXT: retq +; +; AVX512-LABEL: bitcast_i2_2i1: +; AVX512: # BB#0: +; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill> +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = bitcast i2 %a0 to <2 x i1> + ret <2 x i1> %1 +} + +define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) { +; SSE2-SSSE3-LABEL: bitcast_i4_4i1: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: shrl %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm2 +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: bitcast_i4_4i1: +; AVX1: # BB#0: +; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl %ecx +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrl $2, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: shrl $3, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitcast_i4_4i1: +; AVX2: # BB#0: +; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl %ecx +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shrl $2, 
%ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: bitcast_i4_4i1: +; AVX512: # BB#0: +; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = bitcast i4 %a0 to <4 x i1> + ret <4 x i1> %1 +} + +define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { +; SSE2-SSSE3-LABEL: bitcast_i8_8i1: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: shrl $7, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm3 +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: bitcast_i8_8i1: +; AVX12: # BB#0: +; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: movl %eax, %edx +; AVX12-NEXT: andl $1, %edx +; AVX12-NEXT: vmovd %edx, %xmm0 +; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $2, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $3, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $4, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $5, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl 
%eax, %ecx +; AVX12-NEXT: shrl $6, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: shrl $7, %eax +; AVX12-NEXT: movzwl %ax, %eax +; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 +; AVX12-NEXT: retq +; +; AVX512-LABEL: bitcast_i8_8i1: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: vpmovm2w %k0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast i8 %a0 to <8 x i1> + ret <8 x i1> %1 +} + +define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) { +; SSE2-SSSE3-LABEL: bitcast_i16_16i1: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $7, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $6, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $5, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $4, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $3, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $2, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $11, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $10, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $9, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $8, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm1 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $13, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $12, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm3 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrl $14, %ecx +; SSE2-SSSE3-NEXT: andl $1, %ecx +; SSE2-SSSE3-NEXT: movd %ecx, %xmm2 +; SSE2-SSSE3-NEXT: shrl $15, %eax +; SSE2-SSSE3-NEXT: movzwl %ax, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm4 +; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: bitcast_i16_16i1: +; AVX12: # BB#0: +; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp) +; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: movl %eax, %edx +; AVX12-NEXT: andl $1, %edx +; AVX12-NEXT: vmovd %edx, %xmm0 +; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $2, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $3, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $4, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $5, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $6, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $7, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $8, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $9, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $10, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $11, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $12, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $13, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: movl %eax, %ecx +; AVX12-NEXT: shrl $14, %ecx +; AVX12-NEXT: andl $1, %ecx +; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX12-NEXT: shrl $15, 
%eax +; AVX12-NEXT: movzwl %ax, %eax +; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX12-NEXT: retq +; +; AVX512-LABEL: bitcast_i16_16i1: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: vpmovm2b %k0, %xmm0 +; AVX512-NEXT: retq + %1 = bitcast i16 %a0 to <16 x i1> + ret <16 x i1> %1 +} + +define <32 x i1> @bitcast_i32_32i1(i32 %a0) { +; SSE2-SSSE3-LABEL: bitcast_i32_32i1: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movl %esi, (%rdi) +; SSE2-SSSE3-NEXT: movq %rdi, %rax +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: bitcast_i32_32i1: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi0: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi1: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi2: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $17, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: shrl $16, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm0 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $18, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $19, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $20, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $21, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $22, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $23, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $24, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $25, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $26, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $27, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $28, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $29, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $30, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $31, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movl %edi, %ecx +; AVX1-NEXT: andl $1, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm1 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $2, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $3, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; 
AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $6, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $7, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $8, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $9, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $10, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $11, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $12, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $13, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: shrl $14, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX1-NEXT: shrl $15, %edi +; AVX1-NEXT: andl $1, %edi +; AVX1-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitcast_i32_32i1: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .Lcfi0: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .Lcfi1: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: .Lcfi2: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $17, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: movl %edi, %ecx +; AVX2-NEXT: shrl $16, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $18, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $19, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $20, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $21, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $22, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $23, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $24, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $25, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $26, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $27, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $28, %eax +; AVX2-NEXT: andl $1, %eax +; 
AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $29, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $30, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $31, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: movl %edi, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $2, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $4, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $5, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $6, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $7, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $8, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $9, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $10, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $11, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $12, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $13, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: shrl $14, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX2-NEXT: shrl $15, %edi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512-LABEL: bitcast_i32_32i1: +; AVX512: # BB#0: +; AVX512-NEXT: kmovd %edi, %k0 +; AVX512-NEXT: vpmovm2b %k0, %ymm0 +; AVX512-NEXT: retq + %1 = bitcast i32 %a0 to <32 x i1> + ret <32 x i1> %1 +} + +define <64 x i1> @bitcast_i64_64i1(i64 %a0) { +; SSE2-SSSE3-LABEL: bitcast_i64_64i1: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movq %rsi, (%rdi) +; SSE2-SSSE3-NEXT: movq %rdi, %rax +; SSE2-SSSE3-NEXT: retq +; +; AVX12-LABEL: bitcast_i64_64i1: +; AVX12: # BB#0: +; AVX12-NEXT: movq %rsi, (%rdi) +; AVX12-NEXT: movq %rdi, %rax +; AVX12-NEXT: retq +; +; AVX512-LABEL: bitcast_i64_64i1: +; AVX512: # BB#0: +; AVX512-NEXT: kmovq %rdi, %k0 +; AVX512-NEXT: vpmovm2b %k0, %zmm0 +; AVX512-NEXT: retq + %1 = bitcast i64 %a0 to <64 x i1> + ret <64 x i1> %1 +} diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll index 9bf7b41a4f26a..5616276da08d0 100644 --- a/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-setcc-128.ll @@ 
-1,41 +1,41 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSE2 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSSE3 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX1 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX2 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: v8i16: -; SSE2: ## BB#0: +; SSE2: # BB#0: ; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i16: -; SSSE3: ## BB#0: +; SSSE3: # BB#0: ; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSSE3-NEXT: retq ; ; AVX12-LABEL: v8i16: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX512-NEXT: retq %x = icmp sgt <8 x i16> %a, %b %res = bitcast <8 x i1> %x to i8 @@ -44,21 +44,21 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) { define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) { ; SSE2-SSSE3-LABEL: v4i32: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i32: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) @@ -71,21 +71,21 @@ define i4 @v4i32(<4 
x i32> %a, <4 x i32> %b) { define i4 @v4f32(<4 x float> %a, <4 x float> %b) { ; SSE2-SSSE3-LABEL: v4f32: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4f32: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4f32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) @@ -98,24 +98,24 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) { define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-SSSE3-LABEL: v16i8: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v16i8: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; AVX512-NEXT: retq %x = icmp sgt <16 x i8> %a, %b %res = bitcast <16 x i1> %x to i16 @@ -124,7 +124,7 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) { define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; SSE2-SSSE3-LABEL: v2i8: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: psllq $56, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: psrad $31, %xmm2 @@ -151,11 +151,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i8: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 @@ -168,11 +168,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i8: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 @@ -185,11 +185,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; 
AVX512-NEXT: vpsllq $56, %xmm1, %xmm1 ; AVX512-NEXT: vpsraq $56, %xmm1, %xmm1 ; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0 @@ -206,7 +206,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; SSE2-SSSE3-LABEL: v2i16: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: psllq $48, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: psrad $31, %xmm2 @@ -233,11 +233,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i16: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 @@ -250,11 +250,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i16: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 @@ -267,11 +267,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovmskpd %xmm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX512-NEXT: vpsraq $48, %xmm1, %xmm1 ; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0 @@ -288,7 +288,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; SSE2-SSSE3-LABEL: v2i32: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: psllq $32, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] ; SSE2-SSSE3-NEXT: psrad $31, %xmm0 @@ -311,11 +311,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i32: -; AVX1: ## BB#0: +; AVX1: # BB#0: ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] @@ -326,11 +326,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovmskpd %xmm0, %eax -; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i32: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] @@ -341,11 +341,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovmskpd 
%xmm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: retq ; ; AVX512-LABEL: v2i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1 ; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0 @@ -362,7 +362,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-SSSE3-LABEL: v2i64: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 @@ -375,18 +375,18 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v2i64: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskpd %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v2i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) @@ -399,21 +399,21 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) { define i2 @v2f64(<2 x double> %a, <2 x double> %b) { ; SSE2-SSSE3-LABEL: v2f64: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v2f64: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 ; AVX12-NEXT: vmovmskpd %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v2f64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) @@ -426,29 +426,29 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) { define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { ; SSE2-SSSE3-LABEL: v4i8: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: pslld $24, %xmm1 ; SSE2-SSSE3-NEXT: psrad $24, %xmm1 ; SSE2-SSSE3-NEXT: pslld $24, %xmm0 ; SSE2-SSSE3-NEXT: psrad $24, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i8: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpslld $24, %xmm1, %xmm1 ; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1 ; AVX12-NEXT: vpslld $24, %xmm0, %xmm0 ; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $24, %xmm1, %xmm1 ; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1 ; AVX512-NEXT: vpslld $24, %xmm0, %xmm0 @@ -465,29 +465,29 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { define i4 @v4i16(<4 x 
i16> %a, <4 x i16> %b) { ; SSE2-SSSE3-LABEL: v4i16: -; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3: # BB#0: ; SSE2-SSSE3-NEXT: pslld $16, %xmm1 ; SSE2-SSSE3-NEXT: psrad $16, %xmm1 ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i16: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpslld $16, %xmm1, %xmm1 ; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX12-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpslld $16, %xmm1, %xmm1 ; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1 ; AVX512-NEXT: vpslld $16, %xmm0, %xmm0 @@ -504,7 +504,7 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { ; SSE2-LABEL: v8i8: -; SSE2: ## BB#0: +; SSE2: # BB#0: ; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: psllw $8, %xmm0 @@ -513,11 +513,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-NEXT: retq ; ; SSSE3-LABEL: v8i8: -; SSSE3: ## BB#0: +; SSSE3: # BB#0: ; SSSE3-NEXT: psllw $8, %xmm1 ; SSSE3-NEXT: psraw $8, %xmm1 ; SSSE3-NEXT: psllw $8, %xmm0 @@ -525,11 +525,11 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { ; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; SSSE3-NEXT: retq ; ; AVX12-LABEL: v8i8: -; AVX12: ## BB#0: +; AVX12: # BB#0: ; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1 ; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0 @@ -537,18 +537,18 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { ; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1 ; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1 ; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX512-NEXT: retq %x = icmp sgt <8 x i8> %a, %b %res = bitcast <8 x i1> %x to i8 diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll index b2c619c48d4d3..86475c42e79e7 100644 --- a/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-setcc-256.ll @@ -1,23 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX2 -; RUN: llc 
-mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSE2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSSE3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512 define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) { +; SSE2-SSSE3-LABEL: v16i16: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v16i16: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax -; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = icmp sgt <16 x i16> %a, %b @@ -26,19 +50,53 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) { } define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) { +; SSE2-LABEL: v8i32: +; SSE2: # BB#0: +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: packsswb %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: packsswb %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX1-LABEL: v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v8i32: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovmskps %ymm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> 
%EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = icmp sgt <8 x i32> %a, %b @@ -47,19 +105,51 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) { } define i8 @v8f32(<8 x float> %a, <8 x float> %b) { +; SSE2-LABEL: v8f32: +; SSE2: # BB#0: +; SSE2-NEXT: cmpltps %xmm1, %xmm3 +; SSE2-NEXT: cmpltps %xmm0, %xmm2 +; SSE2-NEXT: packsswb %xmm3, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8f32: +; SSSE3: # BB#0: +; SSSE3-NEXT: cmpltps %xmm1, %xmm3 +; SSSE3-NEXT: cmpltps %xmm0, %xmm2 +; SSSE3-NEXT: packsswb %xmm3, %xmm2 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX1-LABEL: v8f32: +; AVX1: # BB#0: +; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v8f32: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovmskps %ymm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8f32: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = fcmp ogt <8 x float> %a, %b @@ -68,15 +158,241 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) { } define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) { +; SSE2-SSSE3-LABEL: v32i8: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; 
SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-SSSE3-NEXT: andb $1, %cl +; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-SSSE3-NEXT: andb $1, %cl +; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: shll $16, %ecx +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: orl %ecx, %eax +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi0: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi1: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; 
AVX1-NEXT: .Lcfi2: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm2, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: movl (%rsp), %eax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v32i8: -; AVX2: ## BB#0: +; 
AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: v32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: vzeroupper @@ -87,16 +403,56 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) { } define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) { +; SSE2-SSSE3-LABEL: v4i64: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v4i64: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovmskpd %ymm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4i64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) @@ -109,16 +465,35 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) { } define i4 @v4f64(<4 x double> %a, <4 x double> %b) { +; SSE2-SSSE3-LABEL: v4f64: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax +; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: v4f64: +; AVX1: # BB#0: +; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v4f64: -; AVX2: ## BB#0: +; AVX2: # BB#0: ; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovmskpd %ymm0, %eax -; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: v4f64: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vcmpltpd %ymm0, 
%ymm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp) diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll new file mode 100644 index 0000000000000..4a5ef99a86537 --- /dev/null +++ b/test/CodeGen/X86/bitcast-setcc-512.ll @@ -0,0 +1,1377 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW + +define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) { +; SSE-LABEL: v32i16: +; SSE: # BB#0: +; SSE-NEXT: pcmpgtw %xmm7, %xmm3 +; SSE-NEXT: pextrb $14, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pcmpgtw %xmm6, %xmm2 +; SSE-NEXT: pextrb $14, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pcmpgtw %xmm5, %xmm1 +; SSE-NEXT: pextrb $14, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, 
%xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pcmpgtw %xmm4, %xmm0 +; SSE-NEXT: pextrb $14, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; SSE-NEXT: shll $16, %ecx +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: orl %ecx, %eax +; SSE-NEXT: retq +; +; AVX1-LABEL: v32i16: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi0: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi1: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi2: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpextrb $14, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: andl 
$1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: movl (%rsp), %eax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v32i16: +; AVX2: # BB#0: +; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v32i16: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: .Lcfi0: +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: .Lcfi1: +; AVX512F-NEXT: .cfi_offset %rbp, -16 +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: .Lcfi2: +; AVX512F-NEXT: .cfi_def_cfa_register %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $32, %rsp +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; 
AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: kshiftlw $15, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $13, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $12, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $11, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $10, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $9, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $7, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $6, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $5, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $4, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; 
AVX512F-NEXT: kshiftlw $3, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $2, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k1 +; AVX512F-NEXT: kshiftrw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, (%rsp) +; AVX512F-NEXT: movl (%rsp), %eax +; AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v32i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x = icmp sgt <32 x i16> %a, %b + %res = bitcast <32 x i1> %x to i32 + ret i32 %res +} + +define i16 @v16i32(<16 x i32> %a, <16 x i32> %b) { +; SSE-LABEL: v16i32: +; SSE: # BB#0: +; SSE-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE-NEXT: pcmpgtd %xmm6, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SSE-NEXT: retq +; +; AVX1-LABEL: v16i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v16i32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x = icmp sgt <16 x i32> %a, %b + %res = bitcast <16 x i1> %x to i16 + ret i16 %res +} + +define i16 @v16f32(<16 x float> %a, <16 x float> %b) { +; SSE-LABEL: v16f32: +; SSE: # BB#0: +; SSE-NEXT: cmpltps %xmm3, %xmm7 +; SSE-NEXT: cmpltps %xmm2, %xmm6 +; SSE-NEXT: packsswb %xmm7, %xmm6 +; SSE-NEXT: cmpltps %xmm1, %xmm5 +; SSE-NEXT: cmpltps %xmm0, %xmm4 +; SSE-NEXT: 
packsswb %xmm5, %xmm4 +; SSE-NEXT: packsswb %xmm6, %xmm4 +; SSE-NEXT: pmovmskb %xmm4, %eax +; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SSE-NEXT: retq +; +; AVX1-LABEL: v16f32: +; AVX1: # BB#0: +; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16f32: +; AVX2: # BB#0: +; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v16f32: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16f32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x = fcmp ogt <16 x float> %a, %b + %res = bitcast <16 x i1> %x to i16 + ret i16 %res +} + +define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) { +; SSE-LABEL: v64i8: +; SSE: # BB#0: +; SSE-NEXT: pcmpgtb %xmm5, %xmm1 +; SSE-NEXT: pextrb $15, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm1, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pcmpgtb %xmm4, %xmm0 +; SSE-NEXT: pextrb $15, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, 
-{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm0, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pcmpgtb %xmm7, %xmm3 +; SSE-NEXT: pextrb $15, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm3, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pcmpgtb %xmm6, %xmm2 +; SSE-NEXT: pextrb $15, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $14, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $13, %xmm2, %eax +; SSE-NEXT: andb 
$1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $12, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $11, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $10, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $9, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $8, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $7, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $6, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $5, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $4, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $3, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $2, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $1, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: pextrb $0, %xmm2, %eax +; SSE-NEXT: andb $1, %al +; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: shll $16, %eax +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx +; SSE-NEXT: orl %eax, %ecx +; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; SSE-NEXT: shll $16, %edx +; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: orl %edx, %eax +; SSE-NEXT: shlq $32, %rax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: v64i8: +; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .Lcfi3: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .Lcfi4: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: .Lcfi5: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpextrb $15, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb 
%al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm4, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax 
+; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX1-NEXT: movl (%rsp), %ecx +; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: shlq $32, %rax +; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v64i8: +; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: .Lcfi0: +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .Lcfi1: +; AVX2-NEXT: .cfi_offset %rbp, -16 +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: .Lcfi2: +; AVX2-NEXT: .cfi_def_cfa_register %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $14, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $13, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $12, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $11, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $10, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $9, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $8, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $7, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $6, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $5, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $4, 
%xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $3, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $2, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $1, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $0, %xmm2, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, (%rsp) +; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrb $15, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $14, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $13, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $11, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $10, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $9, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $7, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $6, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $5, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $3, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $2, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb 
%al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $1, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $0, %xmm1, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: movb %al, {{[0-9]+}}(%rsp) +; AVX2-NEXT: movl (%rsp), %ecx +; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: shlq $32, %rax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v64i8: +; AVX512F: # BB#0: +; AVX512F-NEXT: pushq %rbp +; AVX512F-NEXT: .Lcfi3: +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: .Lcfi4: +; AVX512F-NEXT: .cfi_offset %rbp, -16 +; AVX512F-NEXT: movq %rsp, %rbp +; AVX512F-NEXT: .Lcfi5: +; AVX512F-NEXT: .cfi_def_cfa_register %rbp +; AVX512F-NEXT: andq $-32, %rsp +; AVX512F-NEXT: subq $64, %rsp +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0 +; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, (%rsp) +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: movl (%rsp), %ecx +; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %eax +; AVX512F-NEXT: shlq $32, %rax +; AVX512F-NEXT: orq %rcx, %rax +; 
AVX512F-NEXT: movq %rbp, %rsp +; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v64i8: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x = icmp sgt <64 x i8> %a, %b + %res = bitcast <64 x i1> %x to i64 + ret i64 %res +} + +define i8 @v8i64(<8 x i64> %a, <8 x i64> %b) { +; SSE-LABEL: v8i64: +; SSE: # BB#0: +; SSE-NEXT: pcmpgtq %xmm7, %xmm3 +; SSE-NEXT: pcmpgtq %xmm6, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: pcmpgtq %xmm5, %xmm1 +; SSE-NEXT: pcmpgtq %xmm4, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSE-NEXT: retq +; +; AVX1-LABEL: v8i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v8i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v8i64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v8i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x = icmp sgt <8 x i64> %a, %b + %res = bitcast <8 x i1> %x to i8 + ret i8 %res +} + +define i8 @v8f64(<8 x double> %a, <8 x double> %b) { +; SSE-LABEL: v8f64: +; SSE: # BB#0: +; SSE-NEXT: cmpltpd %xmm3, %xmm7 +; SSE-NEXT: cmpltpd %xmm2, %xmm6 +; SSE-NEXT: packsswb %xmm7, %xmm6 +; SSE-NEXT: cmpltpd %xmm1, %xmm5 +; SSE-NEXT: cmpltpd %xmm0, %xmm4 +; SSE-NEXT: packsswb %xmm5, %xmm4 +; SSE-NEXT: packsswb %xmm6, %xmm4 +; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE-NEXT: pmovmskb %xmm4, %eax +; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; SSE-NEXT: retq +; +; AVX1-LABEL: v8f64: +; AVX1: # BB#0: +; AVX1-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v8f64: +; AVX2: # BB#0: +; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v8f64: +; AVX512F: # BB#0: +; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v8f64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %x = fcmp ogt <8 x double> %a, %b + %res = bitcast <8 x i1> %x to i8 + ret i8 %res +} diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll index c7de65d84507b..b3f6534d14b3b 100644 --- a/test/CodeGen/X86/block-placement.ll +++ b/test/CodeGen/X86/block-placement.ll @@ -354,6 +354,7 @@ define void @unnatural_cfg2() { ; single-source GCC. ; CHECK-LABEL: unnatural_cfg2 ; CHECK: %entry +; CHECK: %loop.header ; CHECK: %loop.body1 ; CHECK: %loop.body2 ; CHECK: %loop.body4 @@ -361,7 +362,6 @@ define void @unnatural_cfg2() { ; CHECK: %loop.inner2.begin ; CHECK: %loop.body3 ; CHECK: %loop.inner1.begin -; CHECK: %loop.header ; CHECK: %bail entry: @@ -1491,6 +1491,102 @@ ret: ; preds = %endif, %then ret void } +define i32 @not_rotate_if_extra_branch(i32 %count) { +; Test checks that there is no loop rotation +; if it introduces an extra branch. +; Specifically, in this case the best exit is .header, +; but it has a fallthrough to the .middle block, and the last +; block in the loop chain, .slow, does not have a fallthrough to .header. +; CHECK-LABEL: not_rotate_if_extra_branch +; CHECK: %.entry +; CHECK: %.header +; CHECK: %.middle +; CHECK: %.backedge +; CHECK: %.slow +; CHECK: %.bailout +; CHECK: %.stop +.entry: + %sum.0 = shl nsw i32 %count, 1 + br label %.header + +.header: + %i = phi i32 [ %i.1, %.backedge ], [ 0, %.entry ] + %sum = phi i32 [ %sum.1, %.backedge ], [ %sum.0, %.entry ] + %is_exc = icmp sgt i32 %i, 9000000 + br i1 %is_exc, label %.bailout, label %.middle, !prof !13 + +.bailout: + %sum.2 = add nsw i32 %count, 1 + br label %.stop + +.middle: + %pr.1 = and i32 %i, 1023 + %pr.2 = icmp eq i32 %pr.1, 0 + br i1 %pr.2, label %.slow, label %.backedge, !prof !14 + +.slow: + tail call void @effect(i32 %sum) + br label %.backedge + +.backedge: + %sum.1 = add nsw i32 %i, %sum + %i.1 = add nsw i32 %i, 1 + %end = icmp slt i32 %i.1, %count + br i1 %end, label %.header, label %.stop, !prof !15 + +.stop: + %sum.phi = phi i32 [ %sum.1, %.backedge ], [ %sum.2, %.bailout ] + ret i32 %sum.phi +} + +define i32 @not_rotate_if_extra_branch_regression(i32 %count, i32 %init) { +; This is a regression test for the patch that avoids loop rotation +; if it introduces an extra branch. 
+; CHECK-LABEL: not_rotate_if_extra_branch_regression +; CHECK: %.entry +; CHECK: %.first_backedge +; CHECK: %.slow +; CHECK: %.second_header +.entry: + %sum.0 = shl nsw i32 %count, 1 + br label %.first_header + +.first_header: + %i = phi i32 [ %i.1, %.first_backedge ], [ 0, %.entry ] + %is_bo1 = icmp sgt i32 %i, 9000000 + br i1 %is_bo1, label %.bailout, label %.first_backedge, !prof !14 + +.first_backedge: + %i.1 = add nsw i32 %i, 1 + %end = icmp slt i32 %i.1, %count + br i1 %end, label %.first_header, label %.second_header, !prof !13 + +.second_header: + %j = phi i32 [ %j.1, %.second_backedge ], [ %init, %.first_backedge ] + %end.2 = icmp sgt i32 %j, %count + br i1 %end.2, label %.stop, label %.second_middle, !prof !14 + +.second_middle: + %is_slow = icmp sgt i32 %j, 9000000 + br i1 %is_slow, label %.slow, label %.second_backedge, !prof !14 + +.slow: + tail call void @effect(i32 %j) + br label %.second_backedge + +.second_backedge: + %j.1 = add nsw i32 %j, 1 + %end.3 = icmp slt i32 %j, 10000000 + br i1 %end.3, label %.second_header, label %.stop, !prof !13 + +.stop: + %res = add nsw i32 %j, %i.1 + ret i32 %res + +.bailout: + ret i32 0 +} + declare void @effect(i32) !5 = !{!"branch_weights", i32 84, i32 16} @@ -1501,3 +1597,6 @@ declare void @effect(i32) !10 = !{!"branch_weights", i32 90, i32 10} !11 = !{!"branch_weights", i32 1, i32 1} !12 = !{!"branch_weights", i32 5, i32 3} +!13 = !{!"branch_weights", i32 1, i32 1} +!14 = !{!"branch_weights", i32 1, i32 1023} +!15 = !{!"branch_weights", i32 4095, i32 1} diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll index a0a1c3646624f..7f7f9791d9038 100644 --- a/test/CodeGen/X86/bool-simplify.ll +++ b/test/CodeGen/X86/bool-simplify.ll @@ -1,45 +1,62 @@ -; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx,+rdrnd,+rdseed | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx,+rdrnd,+rdseed | FileCheck %s define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) { +; CHECK-LABEL: foo: +; CHECK: # BB#0: +; CHECK-NEXT: ptest %xmm0, %xmm0 +; CHECK-NEXT: cmovnel %esi, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c) %t2 = icmp ne i32 %t1, 0 %t3 = select i1 %t2, i32 %a, i32 %b ret i32 %t3 -; CHECK: foo -; CHECK: ptest -; CHECK-NOT: testl -; CHECK: cmov -; CHECK: ret } define i32 @bar(<2 x i64> %c) { +; CHECK-LABEL: bar: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: ptest %xmm0, %xmm0 +; CHECK-NEXT: jne .LBB1_2 +; CHECK-NEXT: # BB#1: # %if-true-block +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB1_2: # %endif-block +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq entry: %0 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c) %1 = icmp ne i32 %0, 0 br i1 %1, label %if-true-block, label %endif-block -if-true-block: ; preds = %entry +if-true-block: ret i32 0 -endif-block: ; preds = %entry, +endif-block: ret i32 1 -; CHECK: bar -; CHECK: ptest -; CHECK-NOT: testl -; CHECK: jne -; CHECK: ret } define i32 @bax(<2 x i64> %c) { +; CHECK-LABEL: bax: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: ptest %xmm0, %xmm0 +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c) %t2 = icmp eq i32 %t1, 1 %t3 = zext i1 %t2 to i32 ret i32 %t3 -; CHECK: bax -; CHECK: ptest -; CHECK-NOT: cmpl -; CHECK: ret } -define i16 @rnd16(i16 %arg) nounwind uwtable { +define i16 @rnd16(i16 %arg) 
nounwind { +; CHECK-LABEL: rnd16: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: rdrandw %cx +; CHECK-NEXT: cmovbw %di, %ax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; CHECK-NEXT: retq %1 = tail call { i16, i32 } @llvm.x86.rdrand.16() nounwind %2 = extractvalue { i16, i32 } %1, 0 %3 = extractvalue { i16, i32 } %1, 1 @@ -47,14 +64,16 @@ define i16 @rnd16(i16 %arg) nounwind uwtable { %5 = select i1 %4, i16 0, i16 %arg %6 = add i16 %5, %2 ret i16 %6 -; CHECK: rnd16 -; CHECK: rdrand -; CHECK: cmov -; CHECK-NOT: cmov -; CHECK: ret } -define i32 @rnd32(i32 %arg) nounwind uwtable { +define i32 @rnd32(i32 %arg) nounwind { +; CHECK-LABEL: rnd32: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: rdrandl %ecx +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: retq %1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind %2 = extractvalue { i32, i32 } %1, 0 %3 = extractvalue { i32, i32 } %1, 1 @@ -62,14 +81,16 @@ define i32 @rnd32(i32 %arg) nounwind uwtable { %5 = select i1 %4, i32 0, i32 %arg %6 = add i32 %5, %2 ret i32 %6 -; CHECK: rnd32 -; CHECK: rdrand -; CHECK: cmov -; CHECK-NOT: cmov -; CHECK: ret } -define i64 @rnd64(i64 %arg) nounwind uwtable { +define i64 @rnd64(i64 %arg) nounwind { +; CHECK-LABEL: rnd64: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: rdrandq %rcx +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq %1 = tail call { i64, i32 } @llvm.x86.rdrand.64() nounwind %2 = extractvalue { i64, i32 } %1, 0 %3 = extractvalue { i64, i32 } %1, 1 @@ -77,14 +98,17 @@ define i64 @rnd64(i64 %arg) nounwind uwtable { %5 = select i1 %4, i64 0, i64 %arg %6 = add i64 %5, %2 ret i64 %6 -; CHECK: rnd64 -; CHECK: rdrand -; CHECK: cmov -; CHECK-NOT: cmov -; CHECK: ret } -define i16 @seed16(i16 %arg) nounwind uwtable { +define i16 @seed16(i16 %arg) nounwind { +; CHECK-LABEL: seed16: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: rdseedw %cx +; CHECK-NEXT: cmovbw %di, %ax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; CHECK-NEXT: retq %1 = tail call { i16, i32 } @llvm.x86.rdseed.16() nounwind %2 = extractvalue { i16, i32 } %1, 0 %3 = extractvalue { i16, i32 } %1, 1 @@ -92,14 +116,16 @@ define i16 @seed16(i16 %arg) nounwind uwtable { %5 = select i1 %4, i16 0, i16 %arg %6 = add i16 %5, %2 ret i16 %6 -; CHECK: seed16 -; CHECK: rdseed -; CHECK: cmov -; CHECK-NOT: cmov -; CHECK: ret } -define i32 @seed32(i32 %arg) nounwind uwtable { +define i32 @seed32(i32 %arg) nounwind { +; CHECK-LABEL: seed32: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: rdseedl %ecx +; CHECK-NEXT: cmovbl %edi, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: retq %1 = tail call { i32, i32 } @llvm.x86.rdseed.32() nounwind %2 = extractvalue { i32, i32 } %1, 0 %3 = extractvalue { i32, i32 } %1, 1 @@ -107,14 +133,16 @@ define i32 @seed32(i32 %arg) nounwind uwtable { %5 = select i1 %4, i32 0, i32 %arg %6 = add i32 %5, %2 ret i32 %6 -; CHECK: seed32 -; CHECK: rdseed -; CHECK: cmov -; CHECK-NOT: cmov -; CHECK: ret } -define i64 @seed64(i64 %arg) nounwind uwtable { +define i64 @seed64(i64 %arg) nounwind { +; CHECK-LABEL: seed64: +; CHECK: # BB#0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: rdseedq %rcx +; CHECK-NEXT: cmovbq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: retq %1 = tail call { i64, i32 } @llvm.x86.rdseed.64() nounwind %2 = extractvalue { i64, i32 } %1, 0 %3 = extractvalue { i64, i32 } %1, 1 @@ 
-122,11 +150,6 @@ define i64 @seed64(i64 %arg) nounwind uwtable { %5 = select i1 %4, i64 0, i64 %arg %6 = add i64 %5, %2 ret i64 %6 -; CHECK: seed64 -; CHECK: rdseed -; CHECK: cmov -; CHECK-NOT: cmov -; CHECK: ret } declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index bbe31c5c2ac58..14bdb3853b031 100644 --- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -1,13 +1,12 @@ -; NOTE: Assertions have been simpilfied MANUALLY after running utils/update_llc_test_checks.py -; Assertions for constant pools have been added MANUALLY. +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=AVX512BW -check-prefix=AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -check-prefix=AVX2-64 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -check-prefix=AVX512F-64 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=AVX512BW -check-prefix=AVX512 -check-prefix=AVX512BW-64 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL32 -check-prefix=AVX512 -check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX-64 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX2-64 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL64 -check-prefix=NO-AVX512BW-64 -check-prefix=AVX512F-64 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL64 -check-prefix=AVX512F-64 -check-prefix=AVX512BW-64 ;===-----------------------------------------------------------------------------=== ; This test checks the ability to recognize a cross element pattern of @@ -17,20 +16,31 @@ ; <i32 0, i32 1, i32 0, i32 1> => broadcast of the constant vector <i32 0, i32 1> ;===-----------------------------------------------------------------------------=== -; ALL: LCPI0 -; ALL-NEXT: .short 256 # 0x100 - define <16 x i8> @f16xi8_i16(<16 x i8> %a) { +; AVX-LABEL: f16xi8_i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, 
%xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f16xi8_i16: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256] ; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f16xi8_i16: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f16xi8_i16: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256] ; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq @@ -40,45 +50,48 @@ define <16 x i8> @f16xi8_i16(<16 x i8> %a) { } -; ALL: .LCPI1 -; ALL-NEXT: .long 50462976 # 0x3020100 - -; AVX: .LCPI1 -; AVX-NEXT .long 50462976 # float 3.82047143E-37 - define <16 x i8> @f16xi8_i32(<16 x i8> %a) { +; AVX-LABEL: f16xi8_i32: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f16xi8_i32: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976] ; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f16xi8_i32: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f16xi8_i32: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976] ; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f16xi8_i32: -; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1 ret <16 x i8> %res2 } -; ALL64: .LCPI2 -; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100 - -; AVX: .LCPI2 -; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 - define <16 x i8> @f16xi8_i64(<16 x i8> %a) { +; AVX-LABEL: f16xi8_i64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f16xi8_i64: ; ALL32: # BB#0: ; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] @@ -86,38 +99,56 @@ define <16 x i8> @f16xi8_i64(<16 x i8> %a) { ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f16xi8_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f16xi8_i64: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528] ; 
ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f16xi8_i64: -; AVX: # BB#0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 %res1 = add <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a %res2 = and <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1 ret <16 x i8> %res2 } -; ALL: .LCPI3 -; ALL-NEXT: .short 256 # 0x100 - define <32 x i8> @f32xi8_i16(<32 x i8> %a) { +; AVX-LABEL: f32xi8_i16: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f32xi8_i16: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm1 +; ALL32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f32xi8_i16: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f32xi8_i16: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1 +; ALL64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq @@ -127,155 +158,273 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) { } -; ALL: .LCPI4 -; ALL-NEXT: .long 50462976 # 0x3020100 - -; AVX: .LCPI4 -; AVX-NEXT: .long 50462976 # float 3.82047143E-37 - define <32 x i8> @f32xi8_i32(<32 x i8> %a) { +; AVX-LABEL: f32xi8_i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f32xi8_i32: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1 +; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f32xi8_i32: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f32xi8_i32: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; ALL64-NEXT: 
vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f32xi8_i32: -; AVX: # BB#0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2 -; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1 ret <32 x i8> %res2 } -; ALL64: .LCPI5 -; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100 - -; AVX: .LCPI5 -; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 - define <32 x i8> @f32xi8_i64(<32 x i8> %a) { +; AVX-LABEL: f32xi8_i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f32xi8_i64: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastq {{\.LCPI.*}}, %ymm1 +; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f32xi8_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f32xi8_i64: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] ; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f32xi8_i64: -; AVX: # BB#0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1 ret <32 x i8> %res2 } -; ALL: .LCPI6 -; ALL-NEXT: .byte 0 # 0x0 -; ALL-NEXT: .byte 1 # 0x1 -; ALL-NEXT: .byte 2 # 0x2 -; ALL-NEXT: .byte 3 # 0x3 -; ALL-NEXT: .byte 4 # 0x4 -; ALL-NEXT: .byte 5 # 0x5 -; ALL-NEXT: .byte 6 # 0x6 -; ALL-NEXT: .byte 7 # 0x7 -; ALL-NEXT: .byte 8 # 0x8 -; ALL-NEXT: .byte 9 # 0x9 -; ALL-NEXT: .byte 10 # 0xa -; ALL-NEXT: .byte 11 # 0xb -; ALL-NEXT: .byte 12 # 0xc -; 
ALL-NEXT: .byte 13 # 0xd -; ALL-NEXT: .byte 14 # 0xe -; ALL-NEXT: .byte 15 # 0xf -; ALL-NOT: .byte - define <32 x i8> @f32xi8_i128(<32 x i8> %a) { -; ALL-LABEL: f32xi8_i128: -; ALL: # BB#0: -; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; ALL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX-LABEL: f32xi8_i128: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f32xi8_i128: +; ALL32: # BB#0: +; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; ALL32-NEXT: # ymm1 = mem[0,1,0,1] +; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f32xi8_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f32xi8_i128: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; ALL64-NEXT: # ymm1 = mem[0,1,0,1] +; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq %res1 = add <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a %res2 = and <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1 ret <32 x i8> %res2 } -; ALL: .LCPI7 -; ALL-NEXT: .short 256 # 0x100 - define <64 x i8> @f64xi8_i16(<64 x i8> %a) { +; AVX-LABEL: f64xi8_i16: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f64xi8_i16: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %zmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = 
[256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl +; +; AVX-64-LABEL: f64xi8_i16: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f64xi8_i16: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64xi8_i16: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] +; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %a %res2 = and <64 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>, %res1 ret <64 x i8> %res2 } -; ALL: .LCPI8 -; ALL-NEXT: .long 50462976 # 0x3020100 - -; AVX: .LCPI8 -; AVX-NEXT: .long 50462976 # float 3.82047143E-37 - define <64 x i8> @f64i8_i32(<64 x i8> %a) { +; AVX-LABEL: f64i8_i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f64i8_i32: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2 +; 
NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f64i8_i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl ; -; AVX-LABEL: f64i8_i32: +; AVX-64-LABEL: f64i8_i32: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f64i8_i32: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64i8_i32: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] +; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq + %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a + %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1 + ret <64 x i8> %res2 +} + + +define <64 x i8> @f64xi8_i64(<64 x i8> %a) { +; AVX-LABEL: f64xi8_i64: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -283,43 +432,69 @@ define <64 x i8> 
@f64i8_i32(<64 x i8> %a) { ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 - %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %a - %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>, %res1 - ret <64 x i8> %res2 -} - - -; ALL64: .LCPI9 -; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100 - -; ALL32: .LCPI9 -; ALL32-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 - -; AVX: .LCPI9 -; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 - -define <64 x i8> @f64xi8_i64(<64 x i8> %a) { +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f64xi8_i64: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i64: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl ; -; AVX-LABEL: f64xi8_i64: +; AVX-64-LABEL: f64xi8_i64: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f64xi8_i64: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; 
NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64xi8_i64: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528] +; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq + %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a + %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1 + ret <64 x i8> %res2 +} + + +define <64 x i8> @f64xi8_i128(<64 x i8> %a) { +; AVX-LABEL: f64xi8_i128: ; AVX: # BB#0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -327,143 +502,184 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 - %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %a - %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %res1 - ret <64 x i8> %res2 -} - - -; ALL: .LCPI10 -; ALL-NEXT: .byte 0 # 0x0 -; ALL-NEXT: .byte 1 # 0x1 -; ALL-NEXT: .byte 2 # 0x2 -; ALL-NEXT: .byte 3 # 0x3 -; ALL-NEXT: .byte 4 # 0x4 -; ALL-NEXT: .byte 5 # 0x5 -; ALL-NEXT: .byte 6 # 0x6 -; ALL-NEXT: .byte 7 # 0x7 -; ALL-NEXT: .byte 8 # 0x8 -; ALL-NEXT: .byte 9 # 0x9 -; ALL-NEXT: .byte 10 # 0xa -; ALL-NEXT: .byte 11 # 0xb -; ALL-NEXT: .byte 12 # 0xc -; ALL-NEXT: .byte 13 # 0xd -; ALL-NEXT: .byte 14 # 0xe -; ALL-NEXT: .byte 15 # 0xf 
-; ALL-NOT: .byte - -define <64 x i8> @f64xi8_i128(<64 x i8> %a) { +; AVX-NEXT: retl +; ; NO-AVX512BW-LABEL: f64xi8_i128: ; NO-AVX512BW: # BB#0: -; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i128: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl +; +; AVX-64-LABEL: f64xi8_i128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: retq +; +; NO-AVX512BW-64-LABEL: f64xi8_i128: +; NO-AVX512BW-64: # BB#0: +; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1] +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-64-NEXT: retq +; +; AVX512BW-64-LABEL: f64xi8_i128: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %a %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 
2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %res1 ret <64 x i8> %res2 } -; AVX512BW: .LCPI11 -; AVX512BW-NEXT: .byte 0 # 0x0 -; AVX512BW-NEXT: .byte 1 # 0x1 -; AVX512BW-NEXT: .byte 2 # 0x2 -; AVX512BW-NEXT: .byte 3 # 0x3 -; AVX512BW-NEXT: .byte 4 # 0x4 -; AVX512BW-NEXT: .byte 5 # 0x5 -; AVX512BW-NEXT: .byte 6 # 0x6 -; AVX512BW-NEXT: .byte 7 # 0x7 -; AVX512BW-NEXT: .byte 8 # 0x8 -; AVX512BW-NEXT: .byte 9 # 0x9 -; AVX512BW-NEXT: .byte 10 # 0xa -; AVX512BW-NEXT: .byte 11 # 0xb -; AVX512BW-NEXT: .byte 12 # 0xc -; AVX512BW-NEXT: .byte 13 # 0xd -; AVX512BW-NEXT: .byte 14 # 0xe -; AVX512BW-NEXT: .byte 15 # 0xf -; AVX512BW-NEXT: .byte 16 # 0x10 -; AVX512BW-NEXT: .byte 17 # 0x11 -; AVX512BW-NEXT: .byte 18 # 0x12 -; AVX512BW-NEXT: .byte 19 # 0x13 -; AVX512BW-NEXT: .byte 20 # 0x14 -; AVX512BW-NEXT: .byte 21 # 0x15 -; AVX512BW-NEXT: .byte 22 # 0x16 -; AVX512BW-NEXT: .byte 23 # 0x17 -; AVX512BW-NEXT: .byte 24 # 0x18 -; AVX512BW-NEXT: .byte 25 # 0x19 -; AVX512BW-NEXT: .byte 26 # 0x1a -; AVX512BW-NEXT: .byte 27 # 0x1b -; AVX512BW-NEXT: .byte 28 # 0x1c -; AVX512BW-NEXT: .byte 29 # 0x1d -; AVX512BW-NEXT: .byte 30 # 0x1e -; AVX512BW-NEXT: .byte 31 # 0x1f -; AVX512BW-NOT: .byte - define <64 x i8> @f64xi8_i256(<64 x i8> %a) { +; AVX-LABEL: f64xi8_i256: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: retl +; +; NO-AVX512BW-LABEL: f64xi8_i256: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: retl +; ; AVX512BW-LABEL: f64xi8_i256: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retl +; +; AVX-64-LABEL: f64xi8_i256: +; AVX-64: # BB#0: +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f64xi8_i256:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f64xi8_i256:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
 %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %a
 %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %res1
 ret <64 x i8> %res2
 }
-; ALL: .LCPI12
-; ALL-NEXT: .long 65536 # 0x10000
-
-; AVX: .LCPI12
-; AVX-NEXT: .long 65536 # float 9.18354962E-41
-
 define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
+; AVX-LABEL: f8xi16_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
 ; ALL32-LABEL: f8xi16_i32:
 ; ALL32: # BB#0:
-; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1
+; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
 ; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT: retl
 ;
+; AVX-64-LABEL: f8xi16_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
 ; ALL64-LABEL: f8xi16_i32:
 ; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
 ; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; ALL64-NEXT: retq
-;
-; AVX-LABEL: f8xi16_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 %res1 = add <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
 %res2 = and <8 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
 ret <8 x i16> %res2
 }
-; ALL64: .LCPI13
-; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
-
-; ALL32: .LCPI13
-; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-; AVX: .LCPI13
-; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
 define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
+; AVX-LABEL: f8xi16_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
 ; ALL32-LABEL: f8xi16_i64:
 ; ALL32: # BB#0:
 ; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -471,67 +687,66 @@ define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT: retl
 ;
+; AVX-64-LABEL: f8xi16_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
 ; ALL64-LABEL: f8xi16_i64:
 ; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
 ; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; ALL64-NEXT: retq
-;
-; AVX-LABEL: f8xi16_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 %res1 = add <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
 %res2 = and <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
 ret <8 x i16> %res2
 }
-; ALL: .LCPI14
-; ALL-NEXT: .long 65536 # 0x10000
-
-; AVX: .LCPI14
-; AVX-NEXT: .long 65536 # float 9.18354962E-41
-
 define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
-; ALL-LABEL: f16xi16_i32:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
-; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
-;
 ; AVX-LABEL: f16xi16_i32:
 ; AVX: # BB#0:
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
 ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f16xi16_i32:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f16xi16_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f16xi16_i32:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
 %res1 = add <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
 %res2 = and <16 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
 ret <16 x i16> %res2
 }
-; ALL64: .LCPI15
-; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
-
-; ALL32: .LCPI15
-; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-; AVX: .LCPI15
-; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
 define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
-; ALL-LABEL: f16xi16_i64:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1
-; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
-;
 ; AVX-LABEL: f16xi16_i64:
 ; AVX: # BB#0:
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -540,60 +755,154 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f16xi16_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
+; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f16xi16_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f16xi16_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
 %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
 %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
 ret <16 x i16> %res2
 }
-; ALL: .LCPI16
-; ALL-NEXT: .short 0 # 0x0
-; ALL-NEXT: .short 1 # 0x1
-; ALL-NEXT: .short 2 # 0x2
-; ALL-NEXT: .short 3 # 0x3
-; ALL-NEXT: .short 4 # 0x4
-; ALL-NEXT: .short 5 # 0x5
-; ALL-NEXT: .short 6 # 0x6
-; ALL-NEXT: .short 7 # 0x7
-; ALL-NOT: .short
-
 define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
-; ALL-LABEL: f16xi16_i128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX-LABEL: f16xi16_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f16xi16_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f16xi16_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f16xi16_i128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
 %res1 = add <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
 %res2 = and <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
 ret <16 x i16> %res2
 }
-; ALL: .LCPI17
-; ALL-NEXT: .long 65536 # 0x10000
-
-; AVX: .LCPI17
-; AVX-NEXT: .long 65536 # float 9.18354962E-41
-
 define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
 ; NO-AVX512BW-LABEL: f32xi16_i32:
 ; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
 ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
 ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
 ;
 ; AVX512BW-LABEL: f32xi16_i32:
 ; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
 ;
-; AVX-LABEL: f32xi16_i32:
+; AVX-64-LABEL: f32xi16_i32:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i32:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i32:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
+ ret <32 x i16> %res2
+}
+
+
+define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i64:
 ; AVX: # BB#0:
 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
 ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -601,43 +910,69 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %a
- %res2 = and <32 x i16> <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1>, %res1
- ret <32 x i16> %res2
-}
-
-
-; ALL64: .LCPI18
-; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000
-
-; ALL32: .LCPI18
-; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-; AVX: .LCPI18
-; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309
-
-define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
+; AVX-NEXT: retl
+;
 ; NO-AVX512BW-LABEL: f32xi16_i64:
 ; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2
+; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
 ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
 ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
 ;
 ; AVX512BW-LABEL: f32xi16_i64:
 ; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
 ;
-; AVX-LABEL: f32xi16_i64:
+; AVX-64-LABEL: f32xi16_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i64:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i64:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
+ %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
+ %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
+ ret <32 x i16> %res2
+}
+
+
+define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i128:
 ; AVX: # BB#0:
 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
 ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -645,87 +980,151 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %a
- %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3, i16 0, i16 1, i16 2, i16 3>, %res1
- ret <32 x i16> %res2
-}
-
-
-; ALL: .LCPI19
-; ALL-NEXT: .short 0 # 0x0
-; ALL-NEXT: .short 1 # 0x1
-; ALL-NEXT: .short 2 # 0x2
-; ALL-NEXT: .short 3 # 0x3
-; ALL-NEXT: .short 4 # 0x4
-; ALL-NEXT: .short 5 # 0x5
-; ALL-NEXT: .short 6 # 0x6
-; ALL-NEXT: .short 7 # 0x7
-; ALL-NOT: .short
-
-define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
+; AVX-NEXT: retl
+;
 ; NO-AVX512BW-LABEL: f32xi16_i128:
 ; NO-AVX512BW: # BB#0:
-; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
 ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
 ; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
 ;
 ; AVX512BW-LABEL: f32xi16_i128:
 ; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f32xi16_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i128:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i128:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
 %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %a
 %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %res1
 ret <32 x i16> %res2
 }
-; AVX512BW: .LCPI20
-; AVX512BW-NEXT: .short 0 # 0x0
-; AVX512BW-NEXT: .short 1 # 0x1
-; AVX512BW-NEXT: .short 2 # 0x2
-; AVX512BW-NEXT: .short 3 # 0x3
-; AVX512BW-NEXT: .short 4 # 0x4
-; AVX512BW-NEXT: .short 5 # 0x5
-; AVX512BW-NEXT: .short 6 # 0x6
-; AVX512BW-NEXT: .short 7 # 0x7
-; AVX512BW-NEXT: .short 8 # 0x8
-; AVX512BW-NEXT: .short 9 # 0x9
-; AVX512BW-NEXT: .short 10 # 0xa
-; AVX512BW-NEXT: .short 11 # 0xb
-; AVX512BW-NEXT: .short 12 # 0xc
-; AVX512BW-NEXT: .short 13 # 0xd
-; AVX512BW-NEXT: .short 14 # 0xe
-; AVX512BW-NEXT: .short 15 # 0xf
-; AVX512BW-NOT: .short
-
 define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
+; AVX-LABEL: f32xi16_i256:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpaddw %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; NO-AVX512BW-LABEL: f32xi16_i256:
+; NO-AVX512BW: # BB#0:
+; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-NEXT: retl
+;
 ; AVX512BW-LABEL: f32xi16_i256:
 ; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retl
+;
+; AVX-64-LABEL: f32xi16_i256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddw %xmm4, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddw %xmm4, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; NO-AVX512BW-64-LABEL: f32xi16_i256:
+; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; NO-AVX512BW-64-NEXT: retq
+;
+; AVX512BW-64-LABEL: f32xi16_i256:
+; AVX512BW-64: # BB#0:
+; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-64-NEXT: retq
 %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
 %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %res1
 ret <32 x i16> %res2
 }
-; ALL64: .LCPI21
-; ALL64-NEXT: .quad 4294967296 # 0x100000000
-
-; ALL32: .LCPI21
-; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
-; AVX: .LCPI21
-; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
 define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
+; AVX-LABEL: f4xi32_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
 ; ALL32-LABEL: f4xi32_i64:
 ; ALL32: # BB#0:
 ; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -733,40 +1132,26 @@ define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT: retl
 ;
+; AVX-64-LABEL: f4xi32_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+;
 ; ALL64-LABEL: f4xi32_i64:
 ; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296]
 ; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; ALL64-NEXT: retq
-;
-; AVX-LABEL: f4xi32_i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
 %res1 = add <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %a
 %res2 = and <4 x i32> <i32 0, i32 1, i32 0, i32 1>, %res1
 ret <4 x i32> %res2
 }
-; ALL64: .LCPI22
-; ALL64-NEXT: .quad 4294967296 # 0x100000000
-
-; ALL32: .LCPI22
-; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
-; AVX: .LCPI22
-; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
 define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
-; ALL-LABEL: f8xi32_i64:
-; ALL: # BB#0:
-; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1
-; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
-;
 ; AVX-LABEL: f8xi32_i64:
 ; AVX: # BB#0:
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -775,59 +1160,154 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xi32_i64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
+; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xi32_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xi32_i64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296]
+; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
 %res1 = add <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
 %res2 = and <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
 ret <8 x i32> %res2
 }
-; ALL: .LCPI23
-; ALL-NEXT: .long 0 # 0x0
-; ALL-NEXT: .long 1 # 0x1
-; ALL-NEXT: .long 2 # 0x2
-; ALL-NEXT: .long 3 # 0x3
-; ALL-NOT: .long
-
 define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
-; ALL-LABEL: f8xi32_i128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX-LABEL: f8xi32_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
+; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xi32_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xi32_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
+; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xi32_i128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: retq
 %res1 = add <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
 %res2 = and <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
 ret <8 x i32> %res2
 }
-; ALL64: .LCPI24
-; ALL64-NEXT: .quad 4294967296 # 0x100000000
-
-; ALL32: .LCPI24
-; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
-; AVX: .LCPI24
-; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314
-
 define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
+; AVX-LABEL: f16xi32_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
 ; AVX2-LABEL: f16xi32_i64:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastq {{.*}}, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
 ;
 ; AVX512-LABEL: f16xi32_i64:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastq {{.*}}, %zmm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
 ;
-; AVX-LABEL: f16xi32_i64:
+; AVX-64-LABEL: f16xi32_i64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xi32_i64:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vpbroadcastq
+; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xi32_i64:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
+; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: retq
+ %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
+ %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
+ ret <16 x i32> %res2
+}
+
+
+define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
+; AVX-LABEL: f16xi32_i128:
 ; AVX: # BB#0:
 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
 ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -835,51 +1315,103 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
- %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
- %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
- ret <16 x i32> %res2
-}
-
-
-; ALL: .LCPI25
-; ALL-NEXT: .long 0 # 0x0
-; ALL-NEXT: .long 1 # 0x1
-; ALL-NEXT: .long 2 # 0x2
-; ALL-NEXT: .long 3 # 0x3
-; ALL-NOT: .long
-
-define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
+; AVX-NEXT: retl
+;
 ; AVX2-LABEL: f16xi32_i128:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
 ;
 ; AVX512-LABEL: f16xi32_i128:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f16xi32_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xi32_i128:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xi32_i128:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: retq
 %res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
 %res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
 ret <16 x i32> %res2
 }
-; ALL64: .LCPI26
-; ALL64-NEXT: .quad 0 # 0x0
-; ALL64-NEXT: .quad 1 # 0x1
-; ALL64-NOT: .quad
-
 define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
+; AVX-LABEL: f4xi64_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f4xi64_i128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f4xi64_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-64-NEXT: movl $1, %eax
+; AVX-64-NEXT: vmovq %rax, %xmm2
+; AVX-64-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-64-NEXT: retq
+;
 ; ALL64-LABEL: f4xi64_i128:
 ; ALL64: # BB#0:
-; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
 ; ALL64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
 ; ALL64-NEXT: retq
@@ -889,15 +1421,62 @@ define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
 }
-; ALL64: .LCPI27
-; ALL64-NEXT: .quad 0 # 0x0
-; ALL64-NEXT: .quad 1 # 0x1
-; ALL64-NOT: .quad
-
 define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
+; AVX-LABEL: f8xi64_i128:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm4
+; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f8xi64_i128:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
+;
+; AVX512-LABEL: f8xi64_i128:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0]
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f8xi64_i128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: movl $1, %eax
+; AVX-64-NEXT: vmovq %rax, %xmm3
+; AVX-64-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,0,1]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
 ; AVX2-64-LABEL: f8xi64_i128:
 ; AVX2-64: # BB#0:
-; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
 ; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -906,57 +1485,99 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
 ;
 ; AVX512F-64-LABEL: f8xi64_i128:
 ; AVX512F-64: # BB#0:
-; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT: retq
-;
-; AVX512BW-64-LABEL: f8xi64_i128:
-; AVX512BW-64: # BB#0:
-; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: retq
 %res1 = add <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %a
 %res2 = and <8 x i64> <i64 0, i64 1, i64 0, i64 1, i64 0, i64 1, i64 0, i64 1>, %res1
 ret <8 x i64> %res2
 }
-; ALL64: .LCPI28
-; ALL64-NEXT: .quad 0 # 0x0
-; ALL64-NEXT: .quad 1 # 0x1
-; ALL64-NEXT: .quad 2 # 0x2
-; ALL64-NEXT: .quad 3 # 0x3
-; ALL64-NOT: .quad
-
 define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
+; AVX-LABEL: f8xi64_i256:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm4
+; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: retl
+;
+; AVX2-LABEL: f8xi64_i256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
+;
+; AVX512-LABEL: f8xi64_i256:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0]
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retl
+;
+; AVX-64-LABEL: f8xi64_i256:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3]
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: movl $1, %eax
+; AVX-64-NEXT: vmovq %rax, %xmm4
+; AVX-64-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX-64-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3]
+; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f8xi64_i256:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
+; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
+;
 ; AVX512F-64-LABEL: f8xi64_i256:
 ; AVX512F-64: # BB#0:
-; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT: retq
-;
-; AVX512BW-64-LABEL: f8xi64_i256:
-; AVX512BW-64: # BB#0:
-; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-64-NEXT: retq
 %res1 = add <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %a
 %res2 = and <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>, %res1
 ret <8 x i64> %res2
 }
-; ALL: .LCPI29
-; ALL-NEXT: .quad 4575657222482165760
-
-; AVX: .LCPI29
-; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
 define <4 x float> @f4xf32_f64(<4 x float> %a) {
+; AVX-LABEL: f4xf32_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retl
+;
 ; ALL32-LABEL: f4xf32_f64:
 ; ALL32: # BB#0:
 ; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
@@ -964,221 +1585,367 @@ define <4 x float> @f4xf32_f64(<4 x float> %a) {
 ; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0
 ; ALL32-NEXT: retl
 ;
+; AVX-64-LABEL: f4xf32_f64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT: retq
+;
 ; ALL64-LABEL: f4xf32_f64:
 ; ALL64: # BB#0:
-; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
+; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
 ; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0
 ; ALL64-NEXT: retq
-;
-; AVX-LABEL: f4xf32_f64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
-; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
 %res1 = fadd <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %a
 %res2 = fdiv <4 x float> <float 2.0, float 1.0, float 2.0, float 1.0>, %res1
 ret <4 x float> %res2
 }
-; ALL64: .LCPI30
-; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000
-
-; ALL32: .LCPI30
-; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
-; AVX: .LCPI30
-; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
 define <8 x float> @f8xf32_f64(<8 x float> %a) {
-; ALL-LABEL: f8xf32_f64:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcastsd {{.*}}, %ymm1
-; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0
-;
 ; AVX-LABEL: f8xf32_f64:
 ; AVX: # BB#0:
-; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm1
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xf32_f64:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xf32_f64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xf32_f64:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL64-NEXT: retq
 %res1 = fadd <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
 %res2 = fdiv <8 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
 ret <8 x float> %res2
 }
-; ALL: .LCPI31
-; ALL-NEXT: .long 1082130432 # float 4
-; ALL-NEXT: .long 1065353216 # float 1
-; ALL-NEXT: .long 1073741824 # float 2
-; ALL-NEXT: .long 1077936128 # float 3
-; ALL-NOT: .long
-
 define <8 x float> @f8xf32_f128(<8 x float> %a) {
-; ALL-LABEL: f8xf32_f128:
-; ALL: # BB#0:
-; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0
-;
 ; AVX-LABEL: f8xf32_f128:
 ; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-NEXT: # ymm1 = mem[0,1,0,1]
 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retl
+;
+; ALL32-LABEL: f8xf32_f128:
+; ALL32: # BB#0:
+; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL32-NEXT: retl
+;
+; AVX-64-LABEL: f8xf32_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-64-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT: retq
+;
+; ALL64-LABEL: f8xf32_f128:
+; ALL64: # BB#0:
+; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
+; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0
+; ALL64-NEXT: retq
 %res1 = fadd <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
 %res2 = fdiv <8 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
 ret <8 x float> %res2
 }
-; ALL64: .LCPI32
-; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000
-
-; ALL32: .LCPI32
-; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
-; AVX: .LCPI32
-; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492
-
 define <16 x float> @f16xf32_f64(<16 x float> %a) {
+; AVX-LABEL: f16xf32_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
 ; AVX2-LABEL: f16xf32_f64:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastsd {{.*}}, %ymm2
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
 ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
 ;
 ; AVX512-LABEL: f16xf32_f64:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastsd {{.*}}, %zmm1
+; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
 ;
-; AVX-LABEL: f16xf32_f64:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm2
-; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-LABEL: f16xf32_f64:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xf32_f64:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xf32_f64:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
+; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
 %res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
 %res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
 ret <16 x float> %res2
 }
-; ALL: .LCPI33
-; ALL-NEXT: .long 1082130432 # float 4
-; ALL-NEXT: .long 1065353216 # float 1
-; ALL-NEXT: .long 1073741824 # float 2
-; ALL-NEXT: .long 1077936128 # float 3
-; ALL-NOT: .long
-
 define <16 x float> @f16xf32_f128(<16 x float> %a) {
+; AVX-LABEL: f16xf32_f128:
+; AVX: # BB#0:
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: retl
+;
 ; AVX2-LABEL: f16xf32_f128:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retl
 ;
 ; AVX512-LABEL: f16xf32_f128:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retl
 ;
-; AVX-LABEL: f16xf32_f128:
-; AVX: # BB#0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-LABEL: f16xf32_f128:
+; AVX-64: # BB#0:
+; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX-64-NEXT: retq
+;
+; AVX2-64-LABEL: f16xf32_f128:
+; AVX2-64: # BB#0:
+; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0
+; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1
+; AVX2-64-NEXT: retq
+;
+; AVX512F-64-LABEL: f16xf32_f128:
+; AVX512F-64: # BB#0:
+; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
+; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
+; AVX512F-64-NEXT: retq
 %res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
 %res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a %res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1 ret <16 x float> %res2 } -; AVX512: .LCPI34 -; AVX512-NEXT: .long 1090519040 # float 8 -; AVX512-NEXT: .long 1065353216 # float 1 -; AVX512-NEXT: .long 1073741824 # float 2 -; AVX512-NEXT: .long 1077936128 # float 3 -; AVX512-NEXT: .long 1082130432 # float 4 -; AVX512-NEXT: .long 1084227584 # float 5 -; AVX512-NEXT: .long 1086324736 # float 6 -; AVX512-NEXT: .long 1088421888 # float 7 -; AVX512-NOT: .long - define <16 x float> @f16xf32_f256(<16 x float> %a) { +; AVX-LABEL: f16xf32_f256: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: retl +; +; AVX2-LABEL: f16xf32_f256: +; AVX2: # BB#0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retl +; ; AVX512-LABEL: f16xf32_f256: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retl +; +; AVX-64-LABEL: f16xf32_f256: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f16xf32_f256: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xf32_f256: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; AVX512F-64-NEXT: retq %res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a 
%res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1 ret <16 x float> %res2 } -; ALL: .LCPI35 -; ALL-NEXT: .quad 4611686018427387904 # double 2 -; ALL-NEXT: .quad 4607182418800017408 # double 1 -; ALL-NOT: .quad - define <4 x double> @f4xf64_f128(<4 x double> %a) { -; ALL-LABEL: f4xf64_f128: -; ALL: # BB#0: -; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; ALL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; ALL-NEXT: vdivpd %ymm0, %ymm1, %ymm0 -; ; AVX-LABEL: f4xf64_f128: ; AVX: # BB#0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX-NEXT: # ymm1 = mem[0,1,0,1] ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retl +; +; ALL32-LABEL: f4xf64_f128: +; ALL32: # BB#0: +; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; ALL32-NEXT: # ymm1 = mem[0,1,0,1] +; ALL32-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; ALL32-NEXT: retl +; +; AVX-64-LABEL: f4xf64_f128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; AVX-64-NEXT: retq +; +; ALL64-LABEL: f4xf64_f128: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; ALL64-NEXT: # ymm1 = mem[0,1,0,1] +; ALL64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; ALL64-NEXT: retq %res1 = fadd <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %a %res2 = fdiv <4 x double> <double 2.0, double 1.0, double 2.0, double 1.0>, %res1 ret <4 x double> %res2 } -; ALL: .LCPI36 -; ALL-NEXT: .quad 4611686018427387904 # double 2 -; ALL-NEXT: .quad 4607182418800017408 # double 1 -; ALL-NOT: .quad - define <8 x double> @f8xf64_f128(<8 x double> %a) { +; AVX-LABEL: f8xf64_f128: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: retl +; ; AVX2-LABEL: f8xf64_f128: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retl ; ; AVX512-LABEL: f8xf64_f128: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retl ; -; AVX-LABEL: f8xf64_f128: -; AVX: # BB#0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX-NEXT: vaddpd %ymm2, %ymm1, 
%ymm1 -; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-64-LABEL: f8xf64_f128: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f8xf64_f128: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xf64_f128: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; AVX512F-64-NEXT: retq %res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a %res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1 ret <8 x double> %res2 @@ -1193,11 +1960,57 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) { ; AVX512-NOT: .quad define <8 x double> @f8xf64_f256(<8 x double> %a) { +; AVX-LABEL: f8xf64_f256: +; AVX: # BB#0: +; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: retl +; +; AVX2-LABEL: f8xf64_f256: +; AVX2: # BB#0: +; AVX2-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retl +; ; AVX512-LABEL: f8xf64_f256: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retl +; +; AVX-64-LABEL: f8xf64_f256: +; AVX-64: # BB#0: +; AVX-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX-64-NEXT: retq +; +; AVX2-64-LABEL: f8xf64_f256: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xf64_f256: +; AVX512F-64: # BB#0: +; 
AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00] +; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; AVX512F-64-NEXT: retq %res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a %res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1 ret <8 x double> %res2 @@ -1205,32 +2018,34 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) { -; ALL: .LCPI38 -; ALL-NEXT: .long 4290379776 # 0xffba0000 - -; AVX: .LCPI38 -; AVX-NEXT: .long 4290379776 # float NaN - define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) { +; AVX-LABEL: f8xi16_i32_NaN: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retl +; ; ALL32-LABEL: f8xi16_i32_NaN: ; ALL32: # BB#0: -; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] ; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; +; AVX-64-LABEL: f8xi16_i32_NaN: +; AVX-64: # BB#0: +; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] +; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; ; ALL64-LABEL: f8xi16_i32_NaN: ; ALL64: # BB#0: -; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] ; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL64-NEXT: retq -; -; AVX-LABEL: f8xi16_i32_NaN: -; AVX: # BB#0: -; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 %res1 = add <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %a %res2 = and <8 x i16> <i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70, i16 0, i16 -70>, %res1 ret <8 x i16> %res2 diff --git a/test/CodeGen/X86/bswap-wide-int.ll b/test/CodeGen/X86/bswap-wide-int.ll index db48eb80de4b9..858dbf5fd85fe 100644 --- a/test/CodeGen/X86/bswap-wide-int.ll +++ b/test/CodeGen/X86/bswap-wide-int.ll @@ -71,8 +71,8 @@ define i128 @bswap_i128(i128 %a0) nounwind { ; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-MOVBE-NEXT: movbel %esi, 12(%eax) ; X86-MOVBE-NEXT: movbel %edi, 8(%eax) -; X86-MOVBE-NEXT: movbel %ecx, 4(%eax) -; X86-MOVBE-NEXT: movbel %edx, (%eax) +; X86-MOVBE-NEXT: movbel %edx, 4(%eax) +; X86-MOVBE-NEXT: movbel %ecx, (%eax) ; X86-MOVBE-NEXT: popl %esi ; X86-MOVBE-NEXT: popl %edi ; X86-MOVBE-NEXT: retl $4 diff --git a/test/CodeGen/X86/build-vector-128.ll b/test/CodeGen/X86/build-vector-128.ll index c73d7654045e4..531c6de5f90cf 100644 --- a/test/CodeGen/X86/build-vector-128.ll +++ b/test/CodeGen/X86/build-vector-128.ll @@ -72,12 +72,10 @@ define <4 x float> @test_buildvector_v4f32(float %a0, float %a1, float %a2, floa } define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) { -; SSE2-32-LABEL: test_buildvector_v2i64: -; SSE2-32: # BB#0: -; SSE2-32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-32-NEXT: retl +; SSE-32-LABEL: test_buildvector_v2i64: +; SSE-32: # BB#0: 
+; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: retl ; ; SSE-64-LABEL: test_buildvector_v2i64: ; SSE-64: # BB#0: @@ -86,20 +84,9 @@ define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) { ; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; -; SSE41-32-LABEL: test_buildvector_v2i64: -; SSE41-32: # BB#0: -; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0 -; SSE41-32-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0 -; SSE41-32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0 -; SSE41-32-NEXT: retl -; ; AVX-32-LABEL: test_buildvector_v2i64: ; AVX-32: # BB#0: -; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v2i64: diff --git a/test/CodeGen/X86/build-vector-256.ll b/test/CodeGen/X86/build-vector-256.ll index 1ced1fc3a3822..942b7779abe63 100644 --- a/test/CodeGen/X86/build-vector-256.ll +++ b/test/CodeGen/X86/build-vector-256.ll @@ -51,18 +51,10 @@ define <8 x float> @test_buildvector_v8f32(float %a0, float %a1, float %a2, floa } define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) { -; AVX1-32-LABEL: test_buildvector_v4i64: -; AVX1-32: # BB#0: -; AVX1-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX1-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-32-NEXT: retl +; AVX-32-LABEL: test_buildvector_v4i64: +; AVX-32: # BB#0: +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0 +; AVX-32-NEXT: retl ; ; AVX1-64-LABEL: test_buildvector_v4i64: ; AVX1-64: # BB#0: @@ -75,19 +67,6 @@ define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) { ; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-64-NEXT: retq ; -; AVX2-32-LABEL: test_buildvector_v4i64: -; AVX2-32: # BB#0: -; AVX2-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX2-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX2-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-32-NEXT: retl -; ; AVX2-64-LABEL: test_buildvector_v4i64: ; AVX2-64: # BB#0: ; AVX2-64-NEXT: vmovq %rcx, %xmm0 diff --git a/test/CodeGen/X86/build-vector-512.ll b/test/CodeGen/X86/build-vector-512.ll index 21737cca93a10..fbfbf2d53c634 100644 --- a/test/CodeGen/X86/build-vector-512.ll +++ b/test/CodeGen/X86/build-vector-512.ll @@ -79,25 +79,7 @@ define <16 x float> @test_buildvector_v16f32(float %a0, float %a1, float %a2, fl define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) { ; AVX-32-LABEL: 
test_buildvector_v8i64: ; AVX-32: # BB#0: -; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0 ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: test_buildvector_v8i64: diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll index 83ab2fac2f167..260535985e2d2 100644 --- a/test/CodeGen/X86/cast-vsel.ll +++ b/test/CodeGen/X86/cast-vsel.ll @@ -148,7 +148,7 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4 ; SSE2-NEXT: andnps %xmm5, %xmm0 ; SSE2-NEXT: orps %xmm4, %xmm0 ; SSE2-NEXT: cvtps2pd %xmm0, %xmm2 -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: cvtps2pd %xmm0, %xmm1 ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll index a6bc5aa321fa5..e2a4368b255a7 100644 --- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -1063,87 +1063,89 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; ; AVX1-LABEL: _clearupper32xi8b: ; AVX1: # BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: pushq %r15 ; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r14 +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %r8 +; AVX1-NEXT: movq %rcx, %r9 +; AVX1-NEXT: movq %rcx, %r10 +; AVX1-NEXT: movq %rcx, %r11 +; AVX1-NEXT: movq %rcx, %r14 +; AVX1-NEXT: movq %rcx, %r15 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: movq %rdx, %r8 -; AVX1-NEXT: movq %rdx, %r9 -; AVX1-NEXT: movq %rdx, %r11 -; AVX1-NEXT: movq %rdx, %rsi -; AVX1-NEXT: movq %rdx, %rdi -; AVX1-NEXT: movq %rdx, %rcx +; AVX1-NEXT: movq %rdx, %r12 +; AVX1-NEXT: movq %rdx, %r13 +; AVX1-NEXT: movq %rdx, %rbx ; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: movq %rdx, %rdi +; AVX1-NEXT: movq %rdx, %rsi +; AVX1-NEXT: movq %rdx, %rbp ; AVX1-NEXT: andb $15, %dl ; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: shrq $56, %rax -; AVX1-NEXT: andb $15, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %r10 -; AVX1-NEXT: shrq $48, %rcx +; AVX1-NEXT: movq %rcx, %rdx ; AVX1-NEXT: andb $15, %cl ; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; 
AVX1-NEXT: movq %r14, %rdx -; AVX1-NEXT: shrq $40, %rdi -; AVX1-NEXT: andb $15, %dil -; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rax -; AVX1-NEXT: shrq $32, %rsi +; AVX1-NEXT: shrq $56, %rbp +; AVX1-NEXT: andb $15, %bpl +; AVX1-NEXT: movb %bpl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $48, %rsi ; AVX1-NEXT: andb $15, %sil ; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rcx -; AVX1-NEXT: shrq $24, %r11 -; AVX1-NEXT: andb $15, %r11b -; AVX1-NEXT: movb %r11b, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rsi -; AVX1-NEXT: shrq $16, %r9 -; AVX1-NEXT: andb $15, %r9b -; AVX1-NEXT: movb %r9b, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rdi -; AVX1-NEXT: shrq $8, %r8 -; AVX1-NEXT: andb $15, %r8b -; AVX1-NEXT: movb %r8b, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq %r14, %rbx -; AVX1-NEXT: andb $15, %r14b -; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: shrq $8, %r10 -; AVX1-NEXT: shrq $16, %rdx -; AVX1-NEXT: shrq $24, %rax -; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: shrq $40, %rsi -; AVX1-NEXT: shrq $48, %rdi -; AVX1-NEXT: shrq $56, %rbx -; AVX1-NEXT: andb $15, %bl -; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $40, %rdi ; AVX1-NEXT: andb $15, %dil ; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: andb $15, %sil -; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $32, %rax ; AVX1-NEXT: andb $15, %al ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $24, %rbx +; AVX1-NEXT: andb $15, %bl +; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $16, %r13 +; AVX1-NEXT: andb $15, %r13b +; AVX1-NEXT: movb %r13b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $8, %r12 +; AVX1-NEXT: andb $15, %r12b +; AVX1-NEXT: movb %r12b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: shrq $8, %r8 +; AVX1-NEXT: shrq $16, %r9 +; AVX1-NEXT: shrq $24, %r10 +; AVX1-NEXT: shrq $32, %r11 +; AVX1-NEXT: shrq $40, %r14 +; AVX1-NEXT: shrq $48, %r15 +; AVX1-NEXT: shrq $56, %rdx ; AVX1-NEXT: andb $15, %dl ; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r15b +; AVX1-NEXT: movb %r15b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r14b +; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r11b +; AVX1-NEXT: movb %r11b, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: andb $15, %r10b ; AVX1-NEXT: movb %r10b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r9b +; AVX1-NEXT: movb %r9b, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: andb $15, %r8b +; AVX1-NEXT: movb %r8b, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movq %rax, %r8 +; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: movq %rax, %rdx ; AVX1-NEXT: movq %rax, %rsi ; AVX1-NEXT: movq %rax, %rdi +; AVX1-NEXT: movl %eax, %ebp ; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: vmovd %eax, %xmm1 ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX1-NEXT: shrl $24, %ebx -; AVX1-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shrl $16, %ebx +; AVX1-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1 +; AVX1-NEXT: shrl $24, %ebp +; AVX1-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1 ; AVX1-NEXT: shrq $32, %rdi ; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 ; AVX1-NEXT: shrq $40, %rsi @@ -1153,8 +1155,8 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; AVX1-NEXT: shrq $48, %rdx ; AVX1-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: shrq $56, %r8 -; AVX1-NEXT: vpinsrb $7, 
%r8d, %xmm1, %xmm0 +; AVX1-NEXT: shrq $56, %rcx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0 ; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %ecx ; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 @@ -1222,92 +1224,98 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 ; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; ; AVX2-LABEL: _clearupper32xi8b: ; AVX2: # BB#0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: pushq %r15 ; AVX2-NEXT: pushq %r14 +; AVX2-NEXT: pushq %r13 +; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r14 +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %r8 +; AVX2-NEXT: movq %rcx, %r9 +; AVX2-NEXT: movq %rcx, %r10 +; AVX2-NEXT: movq %rcx, %r11 +; AVX2-NEXT: movq %rcx, %r14 +; AVX2-NEXT: movq %rcx, %r15 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: movq %rdx, %r8 -; AVX2-NEXT: movq %rdx, %r9 -; AVX2-NEXT: movq %rdx, %r11 -; AVX2-NEXT: movq %rdx, %rsi -; AVX2-NEXT: movq %rdx, %rdi -; AVX2-NEXT: movq %rdx, %rcx +; AVX2-NEXT: movq %rdx, %r12 +; AVX2-NEXT: movq %rdx, %r13 +; AVX2-NEXT: movq %rdx, %rbx ; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: movq %rdx, %rdi +; AVX2-NEXT: movq %rdx, %rsi +; AVX2-NEXT: movq %rdx, %rbp ; AVX2-NEXT: andb $15, %dl ; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: shrq $56, %rax -; AVX2-NEXT: andb $15, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shrq $48, %rcx +; AVX2-NEXT: movq %rcx, %rdx ; AVX2-NEXT: andb $15, %cl ; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rdx -; AVX2-NEXT: shrq $40, %rdi -; AVX2-NEXT: andb $15, %dil -; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rax -; AVX2-NEXT: shrq $32, %rsi +; AVX2-NEXT: shrq $56, %rbp +; AVX2-NEXT: andb $15, %bpl +; AVX2-NEXT: movb %bpl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $48, %rsi ; AVX2-NEXT: andb $15, %sil ; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rcx -; AVX2-NEXT: shrq $24, %r11 -; AVX2-NEXT: andb $15, %r11b -; AVX2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rsi -; AVX2-NEXT: shrq $16, %r9 -; AVX2-NEXT: andb $15, %r9b -; AVX2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rdi -; AVX2-NEXT: shrq $8, %r8 -; AVX2-NEXT: andb $15, %r8b -; AVX2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %r14, %rbx -; AVX2-NEXT: andb $15, %r14b -; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: shrq $8, %r10 -; AVX2-NEXT: shrq $16, %rdx -; AVX2-NEXT: shrq $24, %rax -; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: shrq $40, %rsi -; AVX2-NEXT: shrq $48, %rdi -; AVX2-NEXT: shrq $56, %rbx -; AVX2-NEXT: andb $15, %bl -; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $40, %rdi ; AVX2-NEXT: andb $15, %dil ; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: andb $15, %sil -; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $32, %rax ; AVX2-NEXT: andb $15, %al ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $24, %rbx +; AVX2-NEXT: andb $15, %bl +; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $16, %r13 +; AVX2-NEXT: andb $15, %r13b +; AVX2-NEXT: movb %r13b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $8, %r12 +; 
AVX2-NEXT: andb $15, %r12b +; AVX2-NEXT: movb %r12b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: shrq $8, %r8 +; AVX2-NEXT: shrq $16, %r9 +; AVX2-NEXT: shrq $24, %r10 +; AVX2-NEXT: shrq $32, %r11 +; AVX2-NEXT: shrq $40, %r14 +; AVX2-NEXT: shrq $48, %r15 +; AVX2-NEXT: shrq $56, %rdx ; AVX2-NEXT: andb $15, %dl ; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r15b +; AVX2-NEXT: movb %r15b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r14b +; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r11b +; AVX2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: andb $15, %r10b ; AVX2-NEXT: movb %r10b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r9b +; AVX2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: andb $15, %r8b +; AVX2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: movq %rax, %r8 +; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: movq %rax, %rdx ; AVX2-NEXT: movq %rax, %rsi ; AVX2-NEXT: movq %rax, %rdi +; AVX2-NEXT: movl %eax, %ebp ; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: vmovd %eax, %xmm1 ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; AVX2-NEXT: shrl $24, %ebx -; AVX2-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shrl $16, %ebx +; AVX2-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1 +; AVX2-NEXT: shrl $24, %ebp +; AVX2-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1 ; AVX2-NEXT: shrq $32, %rdi ; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 ; AVX2-NEXT: shrq $40, %rsi @@ -1317,8 +1325,8 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; AVX2-NEXT: shrq $48, %rdx ; AVX2-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: shrq $56, %r8 -; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0 +; AVX2-NEXT: shrq $56, %rcx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %ecx ; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 @@ -1386,7 +1394,11 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %r12 +; AVX2-NEXT: popq %r13 ; AVX2-NEXT: popq %r14 +; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq %x4 = bitcast <32 x i8> %0 to <64 x i4> %r0 = insertelement <64 x i4> %x4, i4 zeroinitializer, i32 1 diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll index d901f16e5c73b..fca39bca6c76a 100644 --- a/test/CodeGen/X86/cmov.ll +++ b/test/CodeGen/X86/cmov.ll @@ -1,34 +1,36 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone { -entry: ; CHECK-LABEL: test1: -; CHECK: btl -; CHECK-NEXT: movl $12, %eax -; CHECK-NEXT: cmovael (%rcx), %eax -; CHECK-NEXT: ret - - %0 = lshr i32 %x, %n ; <i32> [#uses=1] - %1 = and i32 %0, 1 ; <i32> [#uses=1] - %toBool = icmp eq i32 %1, 0 ; <i1> [#uses=1] +; CHECK: # BB#0: # %entry +; CHECK-NEXT: btl %esi, %edi +; CHECK-NEXT: movl $12, %eax +; CHECK-NEXT: cmovael (%rcx), %eax +; 
CHECK-NEXT: retq +entry: + %0 = lshr i32 %x, %n + %1 = and i32 %0, 1 + %toBool = icmp eq i32 %1, 0 %v = load i32, i32* %vp - %.0 = select i1 %toBool, i32 %v, i32 12 ; <i32> [#uses=1] + %.0 = select i1 %toBool, i32 %v, i32 12 ret i32 %.0 } + define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone { -entry: ; CHECK-LABEL: test2: -; CHECK: btl -; CHECK-NEXT: movl $12, %eax -; CHECK-NEXT: cmovbl (%rcx), %eax -; CHECK-NEXT: ret - - %0 = lshr i32 %x, %n ; <i32> [#uses=1] - %1 = and i32 %0, 1 ; <i32> [#uses=1] - %toBool = icmp eq i32 %1, 0 ; <i1> [#uses=1] +; CHECK: # BB#0: # %entry +; CHECK-NEXT: btl %esi, %edi +; CHECK-NEXT: movl $12, %eax +; CHECK-NEXT: cmovbl (%rcx), %eax +; CHECK-NEXT: retq +entry: + %0 = lshr i32 %x, %n + %1 = and i32 %0, 1 + %toBool = icmp eq i32 %1, 0 %v = load i32, i32* %vp - %.0 = select i1 %toBool, i32 12, i32 %v ; <i32> [#uses=1] + %.0 = select i1 %toBool, i32 12, i32 %v ret i32 %.0 } @@ -41,10 +43,13 @@ declare void @bar(i64) nounwind define void @test3(i64 %a, i64 %b, i1 %p) nounwind { ; CHECK-LABEL: test3: -; CHECK: cmov{{n?}}el %[[R1:e..]], %[[R2:e..]] -; CHECK-NOT: movl -; CHECK: call - +; CHECK: # BB#0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: testb $1, %dl +; CHECK-NEXT: cmovel %esi, %edi +; CHECK-NEXT: callq bar +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq %c = trunc i64 %a to i32 %d = trunc i64 %b to i32 %e = select i1 %p, i32 %c, i32 %d @@ -65,52 +70,86 @@ define void @test3(i64 %a, i64 %b, i1 %p) nounwind { ; PR4814 -@g_3 = external global i8 ; <i8*> [#uses=1] -@g_96 = external global i8 ; <i8*> [#uses=2] -@g_100 = external global i8 ; <i8*> [#uses=2] -@_2E_str = external constant [15 x i8], align 1 ; <[15 x i8]*> [#uses=1] +@g_3 = external global i8 +@g_96 = external global i8 +@g_100 = external global i8 +@_2E_str = external constant [15 x i8], align 1 define i1 @test4() nounwind { +; CHECK-LABEL: test4: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movsbl {{.*}}(%rip), %edx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shrb $7, %al +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: xorl $1, %ecx +; CHECK-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> +; CHECK-NEXT: sarl %cl, %edx +; CHECK-NEXT: movb {{.*}}(%rip), %al +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB3_2 +; CHECK-NEXT: # BB#1: # %bb.i.i.i +; CHECK-NEXT: movb {{.*}}(%rip), %cl +; CHECK-NEXT: .LBB3_2: # %func_4.exit.i +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: setne %bl +; CHECK-NEXT: movb %al, %cl +; CHECK-NEXT: je .LBB3_4 +; CHECK-NEXT: # BB#3: # %func_4.exit.i +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: .LBB3_4: # %func_4.exit.i +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB3_7 +; CHECK-NEXT: # BB#5: # %func_4.exit.i +; CHECK-NEXT: testb %bl, %bl +; CHECK-NEXT: jne .LBB3_7 +; CHECK-NEXT: # BB#6: # %bb.i.i +; CHECK-NEXT: movb {{.*}}(%rip), %cl +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: movb %al, %cl +; CHECK-NEXT: .LBB3_7: # %func_1.exit +; CHECK-NEXT: movb %cl, {{.*}}(%rip) +; CHECK-NEXT: movzbl %cl, %esi +; CHECK-NEXT: movl $_2E_str, %edi +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: callq printf +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq entry: - %0 = load i8, i8* @g_3, align 1 ; <i8> [#uses=2] - %1 = sext i8 %0 to i32 ; <i32> [#uses=1] - %.lobit.i = lshr i8 %0, 7 ; <i8> [#uses=1] - %tmp.i = zext i8 %.lobit.i to i32 ; <i32> [#uses=1] - %tmp.not.i = xor i32 %tmp.i, 1 ; <i32> [#uses=1] - %iftmp.17.0.i.i = ashr i32 %1, %tmp.not.i ; <i32> [#uses=1] - %retval56.i.i = trunc i32 %iftmp.17.0.i.i to i8 ; <i8> [#uses=1] 
- %2 = icmp eq i8 %retval56.i.i, 0 ; <i1> [#uses=2] - %g_96.promoted.i = load i8, i8* @g_96 ; <i8> [#uses=3] - %3 = icmp eq i8 %g_96.promoted.i, 0 ; <i1> [#uses=2] + %0 = load i8, i8* @g_3, align 1 + %1 = sext i8 %0 to i32 + %.lobit.i = lshr i8 %0, 7 + %tmp.i = zext i8 %.lobit.i to i32 + %tmp.not.i = xor i32 %tmp.i, 1 + %iftmp.17.0.i.i = ashr i32 %1, %tmp.not.i + %retval56.i.i = trunc i32 %iftmp.17.0.i.i to i8 + %2 = icmp eq i8 %retval56.i.i, 0 + %g_96.promoted.i = load i8, i8* @g_96 + %3 = icmp eq i8 %g_96.promoted.i, 0 br i1 %3, label %func_4.exit.i, label %bb.i.i.i -bb.i.i.i: ; preds = %entry - %4 = load volatile i8, i8* @g_100, align 1 ; <i8> [#uses=0] +bb.i.i.i: + %4 = load volatile i8, i8* @g_100, align 1 br label %func_4.exit.i -; CHECK-LABEL: test4: -; CHECK: g_100 -; CHECK: testb -; CHECK-NOT: xor -; CHECK: setne -; CHECK: testb - -func_4.exit.i: ; preds = %bb.i.i.i, %entry - %.not.i = xor i1 %2, true ; <i1> [#uses=1] - %brmerge.i = or i1 %3, %.not.i ; <i1> [#uses=1] - %.mux.i = select i1 %2, i8 %g_96.promoted.i, i8 0 ; <i8> [#uses=1] +func_4.exit.i: + %.not.i = xor i1 %2, true + %brmerge.i = or i1 %3, %.not.i + %.mux.i = select i1 %2, i8 %g_96.promoted.i, i8 0 br i1 %brmerge.i, label %func_1.exit, label %bb.i.i -bb.i.i: ; preds = %func_4.exit.i - %5 = load volatile i8, i8* @g_100, align 1 ; <i8> [#uses=0] +bb.i.i: + %5 = load volatile i8, i8* @g_100, align 1 br label %func_1.exit -func_1.exit: ; preds = %bb.i.i, %func_4.exit.i - %g_96.tmp.0.i = phi i8 [ %g_96.promoted.i, %bb.i.i ], [ %.mux.i, %func_4.exit.i ] ; <i8> [#uses=2] +func_1.exit: + %g_96.tmp.0.i = phi i8 [ %g_96.promoted.i, %bb.i.i ], [ %.mux.i, %func_4.exit.i ] %ret = phi i1 [ 0, %bb.i.i ], [ %.not.i, %func_4.exit.i ] store i8 %g_96.tmp.0.i, i8* @g_96 - %6 = zext i8 %g_96.tmp.0.i to i32 ; <i32> [#uses=1] - %7 = tail call i32 (i8*, ...) @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind ; <i32> [#uses=0] + %6 = zext i8 %g_96.tmp.0.i to i32 + %7 = tail call i32 (i8*, ...) @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind ret i1 %ret } @@ -120,29 +159,32 @@ declare i32 @printf(i8* nocapture, ...) nounwind ; Should compile to setcc | -2. 
; rdar://6668608 define i32 @test5(i32* nocapture %P) nounwind readonly { -entry: ; CHECK-LABEL: test5: -; CHECK: xorl %eax, %eax -; CHECK: setg %al -; CHECK: orl $-2, %eax -; CHECK: ret - - %0 = load i32, i32* %P, align 4 ; <i32> [#uses=1] - %1 = icmp sgt i32 %0, 41 ; <i1> [#uses=1] - %iftmp.0.0 = select i1 %1, i32 -1, i32 -2 ; <i32> [#uses=1] +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpl $41, (%rdi) +; CHECK-NEXT: setg %al +; CHECK-NEXT: orl $-2, %eax +; CHECK-NEXT: retq +entry: + %0 = load i32, i32* %P, align 4 + %1 = icmp sgt i32 %0, 41 + %iftmp.0.0 = select i1 %1, i32 -1, i32 -2 ret i32 %iftmp.0.0 } define i32 @test6(i32* nocapture %P) nounwind readonly { -entry: ; CHECK-LABEL: test6: -; CHECK: xorl %eax, %eax -; CHECK: setl %al -; CHECK: leal 4(%rax,%rax,8), %eax -; CHECK: ret - %0 = load i32, i32* %P, align 4 ; <i32> [#uses=1] - %1 = icmp sgt i32 %0, 41 ; <i1> [#uses=1] - %iftmp.0.0 = select i1 %1, i32 4, i32 13 ; <i32> [#uses=1] +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpl $42, (%rdi) +; CHECK-NEXT: setl %al +; CHECK-NEXT: leal 4(%rax,%rax,8), %eax +; CHECK-NEXT: retq +entry: + %0 = load i32, i32* %P, align 4 + %1 = icmp sgt i32 %0, 41 + %iftmp.0.0 = select i1 %1, i32 4, i32 13 ret i32 %iftmp.0.0 } @@ -151,16 +193,21 @@ entry: ; because it isn't worth it. Just use a branch instead. define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind { ; CHECK-LABEL: test7: -; CHECK: testb $1, %dil -; CHECK-NEXT: jne LBB - +; CHECK: # BB#0: +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: jne .LBB6_2 +; CHECK-NEXT: # BB#1: +; CHECK-NEXT: movb %dl, %sil +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: retq %d = select i1 %c, i8 %a, i8 %b ret i8 %d } define i32 @smin(i32 %x) { ; CHECK-LABEL: smin: -; CHECK: ## BB#0: +; CHECK: # BB#0: ; CHECK-NEXT: xorl $-1, %edi ; CHECK-NEXT: movl $-1, %eax ; CHECK-NEXT: cmovsl %edi, %eax diff --git a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll index d7dc8defac3ae..875d791dc8021 100644 --- a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll +++ b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll @@ -37,7 +37,7 @@ end: ret void } -define void @nested_loop_0() !prof !1 { +define void @nested_loop_0(i1 %flag) !prof !1 { ; Test if a block that is cold in the inner loop but not cold in the outer loop ; will be merged to the outer loop chain. 
; @@ -68,8 +68,7 @@ if.then: if.else: call void @e() - %call2 = call zeroext i1 @a() - br i1 %call2, label %header2, label %header, !prof !3 + br i1 %flag, label %header2, label %header, !prof !3 end: call void @f() diff --git a/test/CodeGen/X86/combine-avx-intrinsics.ll b/test/CodeGen/X86/combine-avx-intrinsics.ll index 64e081523c1f4..811b1f20833c9 100644 --- a/test/CodeGen/X86/combine-avx-intrinsics.ll +++ b/test/CodeGen/X86/combine-avx-intrinsics.ll @@ -1,59 +1,56 @@ -; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0) { +; CHECK-LABEL: test_x86_avx_blend_pd_256: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a0, i32 7) ret <4 x double> %1 } -; CHECK-LABEL: test_x86_avx_blend_pd_256 -; CHECK-NOT: vblendpd -; CHECK: ret - define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0) { +; CHECK-LABEL: test_x86_avx_blend_ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a0, i32 7) ret <8 x float> %1 } -; CHECK-LABEL: test_x86_avx_blend_ps_256 -; CHECK-NOT: vblendps -; CHECK: ret - define <4 x double> @test2_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) { +; CHECK-LABEL: test2_x86_avx_blend_pd_256: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0) ret <4 x double> %1 } -; CHECK-LABEL: test2_x86_avx_blend_pd_256 -; CHECK-NOT: vblendpd -; CHECK: ret - define <8 x float> @test2_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) { +; CHECK-LABEL: test2_x86_avx_blend_ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0) ret <8 x float> %1 } -; CHECK-LABEL: test2_x86_avx_blend_ps_256 -; CHECK-NOT: vblendps -; CHECK: ret - define <4 x double> @test3_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) { +; CHECK-LABEL: test3_x86_avx_blend_pd_256: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq %1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 -1) ret <4 x double> %1 } -; CHECK-LABEL: test3_x86_avx_blend_pd_256 -; CHECK-NOT: vblendpd -; CHECK: ret - define <8 x float> @test3_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) { +; CHECK-LABEL: test3_x86_avx_blend_ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq %1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 -1) ret <8 x float> %1 } -; CHECK-LABEL: test3_x86_avx_blend_ps_256 -; CHECK-NOT: vblendps -; CHECK: ret - declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) diff --git a/test/CodeGen/X86/combine-avx2-intrinsics.ll b/test/CodeGen/X86/combine-avx2-intrinsics.ll index 2714b26c91414..9a548f6b7f0eb 100644 --- a/test/CodeGen/X86/combine-avx2-intrinsics.ll +++ b/test/CodeGen/X86/combine-avx2-intrinsics.ll @@ -1,88 +1,83 @@ -; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s ; Verify that the 
backend correctly combines AVX2 builtin intrinsics. define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0) { +; CHECK-LABEL: test_x86_avx2_pblendw: +; CHECK: # BB#0: +; CHECK-NEXT: retq %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a0, i32 7) ret <16 x i16> %res } -; CHECK-LABEL: test_x86_avx2_pblendw -; CHECK-NOT: vpblendw -; CHECK: ret - define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0) { +; CHECK-LABEL: test_x86_avx2_pblendd_128: +; CHECK: # BB#0: +; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a0, i32 7) ret <4 x i32> %res } -; CHECK-LABEL: test_x86_avx2_pblendd_128 -; CHECK-NOT: vpblendd -; CHECK: ret - define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0) { +; CHECK-LABEL: test_x86_avx2_pblendd_256: +; CHECK: # BB#0: +; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a0, i32 7) ret <8 x i32> %res } -; CHECK-LABEL: test_x86_avx2_pblendd_256 -; CHECK-NOT: vpblendd -; CHECK: ret - define <16 x i16> @test2_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK-LABEL: test2_x86_avx2_pblendw: +; CHECK: # BB#0: +; CHECK-NEXT: retq %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 0) ret <16 x i16> %res } -; CHECK-LABEL: test2_x86_avx2_pblendw -; CHECK-NOT: vpblendw -; CHECK: ret - define <4 x i32> @test2_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: test2_x86_avx2_pblendd_128: +; CHECK: # BB#0: +; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 0) ret <4 x i32> %res } -; CHECK-LABEL: test2_x86_avx2_pblendd_128 -; CHECK-NOT: vpblendd -; CHECK: ret - define <8 x i32> @test2_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: test2_x86_avx2_pblendd_256: +; CHECK: # BB#0: +; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 0) ret <8 x i32> %res } -; CHECK-LABEL: test2_x86_avx2_pblendd_256 -; CHECK-NOT: vpblendd -; CHECK: ret - define <16 x i16> @test3_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK-LABEL: test3_x86_avx2_pblendw: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 -1) ret <16 x i16> %res } -; CHECK-LABEL: test3_x86_avx2_pblendw -; CHECK-NOT: vpblendw -; CHECK: ret - define <4 x i32> @test3_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: test3_x86_avx2_pblendd_128: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 -1) ret <4 x i32> %res } -; CHECK-LABEL: test3_x86_avx2_pblendd_128 -; CHECK-NOT: vpblendd -; CHECK: ret - define <8 x i32> @test3_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: test3_x86_avx2_pblendd_256: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 -1) ret <8 x i32> %res } -; CHECK-LABEL: test3_x86_avx2_pblendd_256 -; CHECK-NOT: vpblendd -; CHECK: ret - declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) diff --git a/test/CodeGen/X86/combine-rotates.ll b/test/CodeGen/X86/combine-rotates.ll new file mode 100644 index 0000000000000..713ee5d0f65a9 --- /dev/null +++ b/test/CodeGen/X86/combine-rotates.ll 
@@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=XOP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512 + +; fold (rot (rot x, c1), c2) -> rot x, c1+c2 +define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) { +; XOP-LABEL: combine_vec_rot_rot: +; XOP: # BB#0: +; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 +; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1 +; XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: combine_vec_rot_rot: +; AVX512: # BB#0: +; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> + %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28> + %3 = or <4 x i32> %1, %2 + %4 = lshr <4 x i32> %3, <i32 12, i32 13, i32 14, i32 15> + %5 = shl <4 x i32> %3, <i32 20, i32 19, i32 18, i32 17> + %6 = or <4 x i32> %4, %5 + ret <4 x i32> %6 +} + +define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) { +; XOP-LABEL: combine_vec_rot_rot_splat: +; XOP: # BB#0: +; XOP-NEXT: vprotd $7, %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512-LABEL: combine_vec_rot_rot_splat: +; AVX512: # BB#0: +; AVX512-NEXT: vpsrld $3, %xmm0, %xmm1 +; AVX512-NEXT: vpslld $29, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $22, %xmm0, %xmm1 +; AVX512-NEXT: vpslld $10, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3> + %2 = shl <4 x i32> %x, <i32 29, i32 29, i32 29, i32 29> + %3 = or <4 x i32> %1, %2 + %4 = lshr <4 x i32> %3, <i32 22, i32 22, i32 22, i32 22> + %5 = shl <4 x i32> %3, <i32 10, i32 10, i32 10, i32 10> + %6 = or <4 x i32> %4, %5 + ret <4 x i32> %6 +} + +define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) { +; XOP-LABEL: combine_vec_rot_rot_splat_zero: +; XOP: # BB#0: +; XOP-NEXT: retq +; +; AVX512-LABEL: combine_vec_rot_rot_splat_zero: +; AVX512: # BB#0: +; AVX512-NEXT: vpsrld $1, %xmm0, %xmm1 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $31, %xmm0, %xmm1 +; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> + %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31> + %3 = or <4 x i32> %1, %2 + %4 = lshr <4 x i32> %3, <i32 31, i32 31, i32 31, i32 31> + %5 = shl <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1> + %6 = or <4 x i32> %4, %5 + ret <4 x i32> %6 +} diff --git a/test/CodeGen/X86/combine-sse41-intrinsics.ll b/test/CodeGen/X86/combine-sse41-intrinsics.ll index 1916883c201b6..0c8e7b317ec6f 100644 --- a/test/CodeGen/X86/combine-sse41-intrinsics.ll +++ b/test/CodeGen/X86/combine-sse41-intrinsics.ll @@ -1,89 +1,81 @@ -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s define <2 x double> 
@test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test_x86_sse41_blend_pd: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 0) ret <2 x double> %1 } -; CHECK-LABEL: test_x86_sse41_blend_pd -; CHECK-NOT: blendpd -; CHECK: ret - define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) { +; CHECK-LABEL: test_x86_sse41_blend_ps: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 0) ret <4 x float> %1 } -; CHECK-LABEL: test_x86_sse41_blend_ps -; CHECK-NOT: blendps -; CHECK: ret - define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: test_x86_sse41_pblend_w: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 0) ret <8 x i16> %1 } -; CHECK-LABEL: test_x86_sse41_pblend_w -; CHECK-NOT: pblendw -; CHECK: ret - define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) { +; CHECK-LABEL: test2_x86_sse41_blend_pd: +; CHECK: # BB#0: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 -1) ret <2 x double> %1 } -; CHECK-LABEL: test2_x86_sse41_blend_pd -; CHECK-NOT: blendpd -; CHECK: movaps %xmm1, %xmm0 -; CHECK-NEXT: ret - define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) { +; CHECK-LABEL: test2_x86_sse41_blend_ps: +; CHECK: # BB#0: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 -1) ret <4 x float> %1 } -; CHECK-LABEL: test2_x86_sse41_blend_ps -; CHECK-NOT: blendps -; CHECK: movaps %xmm1, %xmm0 -; CHECK-NEXT: ret - define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: test2_x86_sse41_pblend_w: +; CHECK: # BB#0: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 -1) ret <8 x i16> %1 } -; CHECK-LABEL: test2_x86_sse41_pblend_w -; CHECK-NOT: pblendw -; CHECK: movaps %xmm1, %xmm0 -; CHECK-NEXT: ret - define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) { +; CHECK-LABEL: test3_x86_sse41_blend_pd: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a0, i32 7) ret <2 x double> %1 } -; CHECK-LABEL: test3_x86_sse41_blend_pd -; CHECK-NOT: blendpd -; CHECK: ret - define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) { +; CHECK-LABEL: test3_x86_sse41_blend_ps: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a0, i32 7) ret <4 x float> %1 } -; CHECK-LABEL: test3_x86_sse41_blend_ps -; CHECK-NOT: blendps -; CHECK: ret - define <8 x i16> @test3_x86_sse41_pblend_w(<8 x i16> %a0) { +; CHECK-LABEL: test3_x86_sse41_pblend_w: +; CHECK: # BB#0: +; CHECK-NEXT: retq %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a0, i32 7) ret <8 x i16> %1 } -; CHECK-LABEL: test3_x86_sse41_pblend_w -; CHECK-NOT: pblendw -; CHECK: ret - declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) diff --git a/test/CodeGen/X86/constant-hoisting-bfi.ll b/test/CodeGen/X86/constant-hoisting-bfi.ll index 83589b7706f75..d73f7163fd87b 100644 --- 
a/test/CodeGen/X86/constant-hoisting-bfi.ll +++ b/test/CodeGen/X86/constant-hoisting-bfi.ll @@ -4,13 +4,13 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; Check when BFI is enabled for constant hoisting, constant 214748364701 ; will not be hoisted to the func entry. -; CHECK-LABEL: @foo( +; CHECK-LABEL: @test1( ; CHECK: entry: ; CHECK-NOT: bitcast i64 214748364701 to i64 ; CHECK: if.then: ; Function Attrs: norecurse nounwind uwtable -define i64 @foo(i64* nocapture %a) { +define i64 @test1(i64* nocapture %a) { entry: %arrayidx = getelementptr inbounds i64, i64* %a, i64 9 %t0 = load i64, i64* %arrayidx, align 8 @@ -52,7 +52,7 @@ return: ; preds = %if.else5, %if.then, ; in while.body will be hoisted to while.body.preheader. 214748364701 in ; if.then16 and if.else10 will be merged and hoisted to the beginning of ; if.else10 because if.else10 dominates if.then16. -; CHECK-LABEL: @goo( +; CHECK-LABEL: @test2( ; CHECK: entry: ; CHECK-NOT: bitcast i64 214748364701 to i64 ; CHECK: while.body.preheader: @@ -61,7 +61,7 @@ return: ; preds = %if.else5, %if.then, ; CHECK: if.else10: ; CHECK-NEXT: bitcast i64 214748364701 to i64 ; CHECK-NOT: bitcast i64 214748364701 to i64 -define i64 @goo(i64* nocapture %a) { +define i64 @test2(i64* nocapture %a) { entry: %arrayidx = getelementptr inbounds i64, i64* %a, i64 9 %t0 = load i64, i64* %arrayidx, align 8 @@ -113,3 +113,47 @@ return: ; preds = %while.cond.preheade } !0 = !{!"branch_weights", i32 1, i32 2000} + +; 214748364701 will be hoisted to entry block to reduce code size. +; CHECK-LABEL: @test3( +; CHECK: entry: +; CHECK-NEXT: %const = bitcast i64 214748364701 to i64 +define i64 @test3(i64 %t0) { +entry: + %cmp = icmp ult i64 %t0, 56 + br i1 %cmp, label %if.then, label %if.else + +; CHECK: if.then: +; CHECK-NOT: %const = bitcast i64 214748364701 to i64 +if.then: + %add1 = add i64 %t0, 214748364701 + br label %return + +; CHECK: if.else: +; CHECK-NOT: %const = bitcast i64 214748364701 to i64 +if.else: + %add2 = add i64 %t0, 214748364701 + br label %return + +return: + %retval = phi i64 [ %add1, %if.then ], [ %add2, %if.else ] + ret i64 %retval +} + +; 214748364701 will not be hoisted to entry block because it will only +; increase its live range. 
+; CHECK-LABEL: @test4( +; CHECK: nextblock: +; CHECK-NEXT: %add1 = add i64 %t0, 214748364701 +define i64 @test4(i64 %t0) { +entry: + %cmp = icmp ult i64 %t0, 56 + br label %nextblock + +nextblock: + %add1 = add i64 %t0, 214748364701 + br label %return + +return: + ret i64 %add1 +} diff --git a/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll b/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll index 9dd184c8ab316..88778b317b97a 100644 --- a/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll +++ b/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll @@ -62,4 +62,128 @@ define void @test_memcpy_args(i8** %Storage) { call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %Dst, i8* align 4 %Src, i32 4, i32 4) ret void } +define i8* @test_memmove1(i8* %P, i8* %Q) { + ; CHECK: test_memmove + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 1) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $1, %edx + ; CHECK: __llvm_memmove_element_unordered_atomic_1 +} + +define i8* @test_memmove2(i8* %P, i8* %Q) { + ; CHECK: test_memmove2 + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 2, i32 2) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $2, %edx + ; CHECK: __llvm_memmove_element_unordered_atomic_2 +} + +define i8* @test_memmove4(i8* %P, i8* %Q) { + ; CHECK: test_memmove4 + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 4, i32 4) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $4, %edx + ; CHECK: __llvm_memmove_element_unordered_atomic_4 +} + +define i8* @test_memmove8(i8* %P, i8* %Q) { + ; CHECK: test_memmove8 + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %P, i8* align 8 %Q, i32 8, i32 8) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $8, %edx + ; CHECK: __llvm_memmove_element_unordered_atomic_8 +} + +define i8* @test_memmove16(i8* %P, i8* %Q) { + ; CHECK: test_memmove16 + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %P, i8* align 16 %Q, i32 16, i32 16) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $16, %edx + ; CHECK: __llvm_memmove_element_unordered_atomic_16 +} + +define void @test_memmove_args(i8** %Storage) { + ; CHECK: test_memmove_args + %Dst = load i8*, i8** %Storage + %Src.addr = getelementptr i8*, i8** %Storage, i64 1 + %Src = load i8*, i8** %Src.addr + + ; 1st arg (%rdi) + ; CHECK-DAG: movq (%rdi), [[REG1:%r.+]] + ; CHECK-DAG: movq [[REG1]], %rdi + ; 2nd arg (%rsi) + ; CHECK-DAG: movq 8(%rdi), %rsi + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $4, %edx + ; CHECK: __llvm_memmove_element_unordered_atomic_4 + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %Dst, i8* align 4 %Src, i32 4, i32 4) ret void +} + +define i8* @test_memset1(i8* %P, i8 %V) { + ; CHECK: test_memset + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 1, i32 1) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $1, %edx + ; CHECK: __llvm_memset_element_unordered_atomic_1 +} + +define i8* @test_memset2(i8* %P, i8 %V) { + ; CHECK: test_memset2 + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 2, i32 2) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $2, %edx + ; CHECK: __llvm_memset_element_unordered_atomic_2 +} + +define i8* @test_memset4(i8* %P, i8 %V) { + ; 
CHECK: test_memset4 + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 4, i32 4) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $4, %edx + ; CHECK: __llvm_memset_element_unordered_atomic_4 +} + +define i8* @test_memset8(i8* %P, i8 %V) { + ; CHECK: test_memset8 + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 8 %P, i8 %V, i32 8, i32 8) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $8, %edx + ; CHECK: __llvm_memset_element_unordered_atomic_8 +} + +define i8* @test_memset16(i8* %P, i8 %V) { + ; CHECK: test_memset16 + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 16 %P, i8 %V, i32 16, i32 16) + ret i8* %P + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $16, %edx + ; CHECK: __llvm_memset_element_unordered_atomic_16 +} + +define void @test_memset_args(i8** %Storage, i8* %V) { + ; CHECK: test_memset_args + %Dst = load i8*, i8** %Storage + %Val = load i8, i8* %V + + ; 1st arg (%rdi) + ; CHECK-DAG: movq (%rdi), %rdi + ; 2nd arg (%rsi) + ; CHECK-DAG: movzbl (%rsi), %esi + ; 3rd arg (%edx) -- length + ; CHECK-DAG: movl $4, %edx + ; CHECK: __llvm_memset_element_unordered_atomic_4 + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %Dst, i8 %Val, i32 4, i32 4) ret void +} + declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind +declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture, i8, i32, i32) nounwind diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll index 48cb8d70b9748..4ea6b7801fb31 100644 --- a/test/CodeGen/X86/extract-store.ll +++ b/test/CodeGen/X86/extract-store.ll @@ -345,7 +345,7 @@ define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind { ; SSE-X32-LABEL: extract_i64_1: ; SSE-X32: # BB#0: ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-X32-NEXT: movq %xmm0, (%eax) ; SSE-X32-NEXT: retl ; diff --git a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll index 5d5cbc76f92ee..4d0b5ccc16b0e 100644 --- a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll +++ b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple i386-apple-darwin -mcpu=yonah | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah | FileCheck %s target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" @@ -6,31 +7,31 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" ; into loads, off the stack or a previous store. ; Be very explicit about the ordering/stack offsets. 
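To make the ordering hazard concrete, a minimal hand-written sketch (not from the patch): when x86 legalization cannot keep an extracted lane in a register, it may rewrite each extractelement as a scalar load, either from a stack spill or from a prior store of the same vector, so that store must stay ordered before the loads it feeds.

define i32 @sketch_store_then_extract(<4 x i32>* %p) {
entry:
  %v = load <4 x i32>, <4 x i32>* %p, align 16
  %sum = add <4 x i32> %v, %v
  ; this store must complete before legalization rewrites the
  ; extract below as a scalar load of the same memory
  store <4 x i32> %sum, <4 x i32>* %p, align 16
  %lane2 = extractelement <4 x i32> %sum, i32 2
  ret i32 %lane2
}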
-; CHECK-LABEL: test_extractelement_legalization_storereuse:
-; CHECK: # BB#0
-; CHECK-NEXT: pushl %ebx
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: movl 16(%esp), %eax
-; CHECK-NEXT: movl 24(%esp), %ecx
-; CHECK-NEXT: movl 20(%esp), %edx
-; CHECK-NEXT: paddd (%edx), %xmm0
-; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: movl (%edx), %esi
-; CHECK-NEXT: movl 4(%edx), %edi
-; CHECK-NEXT: shll $4, %ecx
-; CHECK-NEXT: movl 8(%edx), %ebx
-; CHECK-NEXT: movl 12(%edx), %edx
-; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT: movl %edi, (%eax,%ecx)
-; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT: movl %edx, 4(%eax,%ecx)
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: popl %ebx
-; CHECK-NEXT: retl
-
 define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {
+; CHECK-LABEL: test_extractelement_legalization_storereuse:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: paddd (%ecx), %xmm0
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movdqa %xmm0, (%ecx)
+; CHECK-NEXT: movl (%ecx), %esi
+; CHECK-NEXT: movl 4(%ecx), %edi
+; CHECK-NEXT: shll $4, %edx
+; CHECK-NEXT: movl 8(%ecx), %ebx
+; CHECK-NEXT: movl 12(%ecx), %ecx
+; CHECK-NEXT: movl %esi, 12(%eax,%edx)
+; CHECK-NEXT: movl %edi, (%eax,%edx)
+; CHECK-NEXT: movl %ebx, 8(%eax,%edx)
+; CHECK-NEXT: movl %ecx, 4(%eax,%edx)
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
 entry:
   %0 = bitcast i32* %y to <4 x i32>*
   %1 = load <4 x i32>, <4 x i32>* %0, align 16
diff --git a/test/CodeGen/X86/fast-isel-abort-warm.ll b/test/CodeGen/X86/fast-isel-abort-warm.ll
index 3caa91b11ec69..e87d14bb28ade 100644
--- a/test/CodeGen/X86/fast-isel-abort-warm.ll
+++ b/test/CodeGen/X86/fast-isel-abort-warm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -fast-isel -o - %s -fast-isel-report-on-fallback 2>&1 | FileCheck %s
+; RUN: llc -fast-isel -o - %s -fast-isel-report-on-fallback -pass-remarks-missed=isel 2>&1 | FileCheck %s
 ; Make sure FastISel reports a warning when we asked it to do so.
 ; Note: This test needs to use whatever is not supported by FastISel.
 ; Thus, this test may fail because inline asm gets supported in FastISel.
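A minimal reproducer sketch for the new diagnostics (hand-written; it assumes the two flags behave as in the RUN line above, and that i128 arithmetic stays unsupported by FastISel, which the i128 test below relies on): any instruction FastISel cannot select should produce a -pass-remarks-missed=isel remark naming the missed instruction, followed by the fallback warning.

; RUN: llc -fast-isel -fast-isel-report-on-fallback -pass-remarks-missed=isel %s -o /dev/null 2>&1
define void @sketch_fallback(i128* %p) {
  ; i128 ops are not handled by FastISel, so selection falls back to
  ; SelectionDAG, emitting both the remark and the warning
  %v = load i128, i128* %p
  %twice = add i128 %v, %v
  store i128 %twice, i128* %p
  ret void
}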
@@ -6,9 +6,26 @@
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"
 
+; CHECK: remark: <unknown>:0:0: FastISel missed call: call void asm sideeffect
 ; CHECK: warning: Instruction selection used fallback path for foo
 define void @foo(){
 entry:
   call void asm sideeffect "nop", "~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
+
+; CHECK: remark: <unknown>:0:0: FastISel missed: store i128
+; CHECK: warning: Instruction selection used fallback path for test_instruction_fallback
+define void @test_instruction_fallback(i128* %ptr){
+  %v1 = load i128, i128* %ptr
+  %result = add i128 %v1, %v1
+  store i128 %result, i128 * %ptr
+  ret void
+}
+
+; CHECK-NOT: remark: <unknown>:0:0: FastISel missed
+; CHECK-NOT: warning: Instruction selection used fallback path for test_instruction_not_fallback
+define i32 @test_instruction_not_fallback(i32 %a){
+  %result = add i32 %a, %a
+  ret i32 %result
+}
diff --git a/test/CodeGen/X86/fast-isel-gc-intrinsics.ll b/test/CodeGen/X86/fast-isel-gc-intrinsics.ll
new file mode 100644
index 0000000000000..bf08ad01d7d8c
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-gc-intrinsics.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -fast-isel
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+; Don't crash with gc intrinsics.
+
+; gcrelocate call should not be an LLVM Machine Block by itself.
+define i8 addrspace(1)* @test_gcrelocate(i8 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+  %tok = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %v)
+  %vnew = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 7, i32 7)
+  ret i8 addrspace(1)* %vnew
+}
+
+; gcresult calls are fine in their own blocks.
+define i1 @test_gcresult() gc "statepoint-example" {
+entry:
+  %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0)
+  %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token)
+  ret i1 %call1
+}
+
+; We are okay here because we see the gcrelocate and avoid generating its own
+; block.
+define i1 @test_gcresult_gcrelocate(i8 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+  %safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %v)
+  %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(token %safepoint_token)
+  %vnew = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7)
+  ret i1 %call1
+}
+
+define i8 addrspace(1)* @test_non_entry_block(i8 addrspace(1)* %v, i8 %val) gc "statepoint-example" {
+entry:
+  %load = load i8, i8 addrspace(1)* %v
+  %cmp = icmp eq i8 %load, %val
+  br i1 %cmp, label %func_call, label %exit
+
+func_call:
+  call void @dummy()
+  %tok = call token (i64, i32, void ()*, i32, i32, ...)
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %v) + %vnew = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 7, i32 7) + ret i8 addrspace(1)* %vnew + +exit: + ret i8 addrspace(1)* %v + +} + +declare void @dummy() +declare void @foo() + +declare zeroext i1 @return_i1() +declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) +declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...) +declare i1 @llvm.experimental.gc.result.i1(token) +declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) diff --git a/test/CodeGen/X86/fastisel-softfloat.ll b/test/CodeGen/X86/fastisel-softfloat.ll new file mode 100644 index 0000000000000..e4330db81e1ab --- /dev/null +++ b/test/CodeGen/X86/fastisel-softfloat.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define float @pr26522(float %pat) #0 { +; CHECK-LABEL: pr26522: +; CHECK: # BB#0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq + ret float %pat +} + +attributes #0 = { noinline optnone "target-features"="+soft-float" } diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll index 6c6bc8bdc1d13..98082ec611d49 100644 --- a/test/CodeGen/X86/fp128-i128.ll +++ b/test/CodeGen/X86/fp128-i128.ll @@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 { ; CHECK-NEXT: andq %rdi, %rcx ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 ; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: orq %rcx, %rdx ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: jmp foo # TAILCALL diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll index c3109673468ec..e09ad3e4e0b85 100644 --- a/test/CodeGen/X86/gather-addresses.ll +++ b/test/CodeGen/X86/gather-addresses.ll @@ -16,10 +16,10 @@ ; LIN: sarq $32, %r[[REG2]] ; LIN: movslq %e[[REG4]], %r[[REG3:.+]] ; LIN: sarq $32, %r[[REG4]] -; LIN: movsd (%rdi,%r[[REG1]],8), %xmm0 -; LIN: movhpd (%rdi,%r[[REG2]],8), %xmm0 -; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1 -; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1 +; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1 +; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1 +; LIN: movq %rdi, %xmm1 +; LIN: movq %r[[REG3]], %xmm0 ; WIN: movdqa (%rdx), %xmm0 ; WIN: pand (%r8), %xmm0 @@ -29,10 +29,10 @@ ; WIN: sarq $32, %r[[REG2]] ; WIN: movslq %e[[REG4]], %r[[REG3:.+]] ; WIN: sarq $32, %r[[REG4]] -; WIN: movsd (%rcx,%r[[REG1]],8), %xmm0 -; WIN: movhpd (%rcx,%r[[REG2]],8), %xmm0 -; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1 -; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1 +; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1 +; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1 +; WIN: movdqa (%r[[REG2]]), %xmm0 +; WIN: movq %r[[REG2]], %xmm1 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { %a = load <4 x i32>, <4 x i32>* %i diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll index 4c8003f0c516e..b7c43d3b2e3ec 100644 --- a/test/CodeGen/X86/half.ll +++ b/test/CodeGen/X86/half.ll @@ -1,266 +1,833 @@ -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=1 \ -; RUN: 
| FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWON -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=0 \ -; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWOFF -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -asm-verbose=false -fixup-byte-word-insts=1 \ -; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C -check-prefix=BWON -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -asm-verbose=false -fixup-byte-word-insts=0 \ -; RUN: | FileCheck %s -check-prefix=CHECK-I686 - -define void @test_load_store(half* %in, half* %out) { -; CHECK-LABEL: test_load_store: -; BWON: movzwl (%rdi), %eax -; BWOFF: movw (%rdi), %ax -; CHECK: movw %ax, (%rsi) +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=1 \ +; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON,BWON-NOF16C +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=0 \ +; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWOFF +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+f16c -fixup-byte-word-insts=1 \ +; RUN: | FileCheck %s -check-prefixes=CHECK,BWON,BWON-F16C +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -fixup-byte-word-insts=0 \ +; RUN: | FileCheck %s -check-prefixes=CHECK-I686 + +define void @test_load_store(half* %in, half* %out) #0 { +; BWON-LABEL: test_load_store: +; BWON: # BB#0: +; BWON-NEXT: movzwl (%rdi), %eax +; BWON-NEXT: movw %ax, (%rsi) +; BWON-NEXT: retq +; +; BWOFF-LABEL: test_load_store: +; BWOFF: # BB#0: +; BWOFF-NEXT: movw (%rdi), %ax +; BWOFF-NEXT: movw %ax, (%rsi) +; BWOFF-NEXT: retq +; +; CHECK-I686-LABEL: test_load_store: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-I686-NEXT: movw (%ecx), %cx +; CHECK-I686-NEXT: movw %cx, (%eax) +; CHECK-I686-NEXT: retl %val = load half, half* %in store half %val, half* %out ret void } -define i16 @test_bitcast_from_half(half* %addr) { -; CHECK-LABEL: test_bitcast_from_half: -; BWON: movzwl (%rdi), %eax -; BWOFF: movw (%rdi), %ax +define i16 @test_bitcast_from_half(half* %addr) #0 { +; BWON-LABEL: test_bitcast_from_half: +; BWON: # BB#0: +; BWON-NEXT: movzwl (%rdi), %eax +; BWON-NEXT: retq +; +; BWOFF-LABEL: test_bitcast_from_half: +; BWOFF: # BB#0: +; BWOFF-NEXT: movw (%rdi), %ax +; BWOFF-NEXT: retq +; +; CHECK-I686-LABEL: test_bitcast_from_half: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movw (%eax), %ax +; CHECK-I686-NEXT: retl %val = load half, half* %addr %val_int = bitcast half %val to i16 ret i16 %val_int } -define void @test_bitcast_to_half(half* %addr, i16 %in) { +define void @test_bitcast_to_half(half* %addr, i16 %in) #0 { ; CHECK-LABEL: test_bitcast_to_half: -; CHECK: movw %si, (%rdi) +; CHECK: # BB#0: +; CHECK-NEXT: movw %si, (%rdi) +; CHECK-NEXT: retq +; +; CHECK-I686-LABEL: test_bitcast_to_half: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: movw {{[0-9]+}}(%esp), %ax +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-I686-NEXT: movw %ax, (%ecx) +; CHECK-I686-NEXT: retl %val_fp = bitcast i16 %in to half store half %val_fp, half* %addr ret void } -define float @test_extend32(half* %addr) { -; CHECK-LABEL: test_extend32: - -; CHECK-LIBCALL: 
jmp __gnu_h2f_ieee -; CHECK-F16C: vcvtph2ps +define float @test_extend32(half* %addr) #0 { +; CHECK-LIBCALL-LABEL: test_extend32: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi +; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee # TAILCALL +; +; BWON-F16C-LABEL: test_extend32: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: movswl (%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_extend32: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: subl $12, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movzwl (%eax), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: addl $12, %esp +; CHECK-I686-NEXT: retl %val16 = load half, half* %addr %val32 = fpext half %val16 to float ret float %val32 } -define double @test_extend64(half* %addr) { -; CHECK-LABEL: test_extend64: - -; CHECK-LIBCALL: callq __gnu_h2f_ieee -; CHECK-LIBCALL: cvtss2sd -; CHECK-F16C: vcvtph2ps -; CHECK-F16C: vcvtss2sd +define double @test_extend64(half* %addr) #0 { +; CHECK-LIBCALL-LABEL: test_extend64: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rax +; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 +; CHECK-LIBCALL-NEXT: popq %rax +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_extend64: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: movswl (%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_extend64: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: subl $12, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movzwl (%eax), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: addl $12, %esp +; CHECK-I686-NEXT: retl %val16 = load half, half* %addr %val32 = fpext half %val16 to double ret double %val32 } -define void @test_trunc32(float %in, half* %addr) { -; CHECK-LABEL: test_trunc32: - -; CHECK-LIBCALL: callq __gnu_f2h_ieee -; CHECK-F16C: vcvtps2ph +define void @test_trunc32(float %in, half* %addr) #0 { +; CHECK-LIBCALL-LABEL: test_trunc32: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: movq %rdi, %rbx +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_trunc32: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: movw %ax, (%rdi) +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_trunc32: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $8, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movw %ax, (%esi) +; CHECK-I686-NEXT: addl $8, %esp +; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: retl %val16 = fptrunc float %in to half store half %val16, half* %addr ret void } -define void @test_trunc64(double %in, half* %addr) { +define void @test_trunc64(double %in, half* %addr) #0 { ; CHECK-LABEL: test_trunc64: - -; CHECK-LIBCALL: callq __truncdfhf2 -; CHECK-F16C: callq __truncdfhf2 +; CHECK: # BB#0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq 
%rdi, %rbx +; CHECK-NEXT: callq __truncdfhf2 +; CHECK-NEXT: movw %ax, (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq +; +; CHECK-I686-LABEL: test_trunc64: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $8, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-I686-NEXT: movsd %xmm0, (%esp) +; CHECK-I686-NEXT: calll __truncdfhf2 +; CHECK-I686-NEXT: movw %ax, (%esi) +; CHECK-I686-NEXT: addl $8, %esp +; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: retl %val16 = fptrunc double %in to half store half %val16, half* %addr ret void } define i64 @test_fptosi_i64(half* %p) #0 { -; CHECK-LABEL: test_fptosi_i64: - -; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax -; CHECK-LIBCALL-NEXT: popq %rcx -; CHECK-LIBCALL-NEXT: retq - -; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vcvttss2si [[REG2]], %rax -; CHECK-F16C-NEXT: retq +; CHECK-LIBCALL-LABEL: test_fptosi_i64: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rax +; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax +; CHECK-LIBCALL-NEXT: popq %rcx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_fptosi_i64: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: movswl (%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_fptosi_i64: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: subl $12, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movzwl (%eax), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstps (%esp) +; CHECK-I686-NEXT: calll __fixsfdi +; CHECK-I686-NEXT: addl $12, %esp +; CHECK-I686-NEXT: retl %a = load half, half* %p, align 2 %r = fptosi half %a to i64 ret i64 %r } define void @test_sitofp_i64(i64 %a, half* %p) #0 { -; CHECK-LABEL: test_sitofp_i64: - -; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z]+]] -; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]] -; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0 -; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee -; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]]) -; CHECK_LIBCALL-NEXT: popq [[ADDR]] -; CHECK_LIBCALL-NEXT: retq - -; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0:%[a-z0-9]+]], [[REG0]] -; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG0]], [[REG0]] -; CHECK-F16C-NEXT: vmovd [[REG0]], %eax -; CHECK-F16C-NEXT: movw %ax, (%rsi) -; CHECK-F16C-NEXT: retq +; CHECK-LIBCALL-LABEL: test_sitofp_i64: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: movq %rsi, %rbx +; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0 +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_sitofp_i64: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: movw %ax, (%rsi) +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_sitofp_i64: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $24, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi +; 
CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movw %ax, (%esi) +; CHECK-I686-NEXT: addl $24, %esp +; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: retl %r = sitofp i64 %a to half store half %r, half* %p ret void } define i64 @test_fptoui_i64(half* %p) #0 { -; CHECK-LABEL: test_fptoui_i64: - -; FP_TO_UINT is expanded using FP_TO_SINT -; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movss {{.[A-Z_0-9]+}}(%rip), [[REG1:%[a-z0-9]+]] -; CHECK-LIBCALL-NEXT: movaps %xmm0, [[REG2:%[a-z0-9]+]] -; CHECK-LIBCALL-NEXT: subss [[REG1]], [[REG2]] -; CHECK-LIBCALL-NEXT: cvttss2si [[REG2]], [[REG3:%[a-z0-9]+]] -; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, [[REG4:%[a-z0-9]+]] -; CHECK-LIBCALL-NEXT: xorq [[REG3]], [[REG4]] -; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, [[REG5:%[a-z0-9]+]] -; CHECK-LIBCALL-NEXT: ucomiss [[REG1]], %xmm0 -; CHECK-LIBCALL-NEXT: cmovaeq [[REG4]], [[REG5]] -; CHECK-LIBCALL-NEXT: popq %rcx -; CHECK-LIBCALL-NEXT: retq - -; CHECK-F16C-NEXT: movswl (%rdi), [[REG0:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vmovd [[REG0]], [[REG1:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vcvtph2ps [[REG1]], [[REG2:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vmovss {{.[A-Z_0-9]+}}(%rip), [[REG3:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vsubss [[REG3]], [[REG2]], [[REG4:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vcvttss2si [[REG4]], [[REG5:%[a-z0-9]+]] -; CHECK-F16C-NEXT: movabsq $-9223372036854775808, [[REG6:%[a-z0-9]+]] -; CHECK-F16C-NEXT: xorq [[REG5]], [[REG6:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vcvttss2si [[REG2]], [[REG7:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vucomiss [[REG3]], [[REG2]] -; CHECK-F16C-NEXT: cmovaeq [[REG6]], %rax -; CHECK-F16C-NEXT: retq +; CHECK-LIBCALL-LABEL: test_fptoui_i64: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rax +; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-LIBCALL-NEXT: movaps %xmm0, %xmm2 +; CHECK-LIBCALL-NEXT: subss %xmm1, %xmm2 +; CHECK-LIBCALL-NEXT: cvttss2si %xmm2, %rax +; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; CHECK-LIBCALL-NEXT: xorq %rax, %rcx +; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax +; CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-LIBCALL-NEXT: cmovaeq %rcx, %rax +; CHECK-LIBCALL-NEXT: popq %rcx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_fptoui_i64: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: movswl (%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; BWON-F16C-NEXT: vsubss %xmm1, %xmm0, %xmm2 +; BWON-F16C-NEXT: vcvttss2si %xmm2, %rax +; BWON-F16C-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; BWON-F16C-NEXT: xorq %rax, %rcx +; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax +; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0 +; BWON-F16C-NEXT: cmovaeq %rcx, %rax +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_fptoui_i64: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: subl $12, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movzwl (%eax), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; 
CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstps (%esp) +; CHECK-I686-NEXT: calll __fixunssfdi +; CHECK-I686-NEXT: addl $12, %esp +; CHECK-I686-NEXT: retl %a = load half, half* %p, align 2 %r = fptoui half %a to i64 ret i64 %r } define void @test_uitofp_i64(i64 %a, half* %p) #0 { -; CHECK-LABEL: test_uitofp_i64: -; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z0-9]+]] -; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]] -; CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: js [[LABEL1:.LBB[0-9_]+]] - -; simple conversion to float if non-negative -; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]], [[REG1]] -; CHECK-NEXT: jmp [[LABEL2:.LBB[0-9_]+]] - -; convert using shift+or if negative -; CHECK-NEXT: [[LABEL1]]: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: shrq %rax -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: orq %rax, [[REG2:%[a-z0-9]+]] -; CHECK-LIBCALL-NEXT: cvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]] -; CHECK-LIBCALL-NEXT: addss [[REG3]], [[REG1]] -; CHECK-F16C-NEXT: vcvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]], [[REG3]] -; CHECK-F16C-NEXT: vaddss [[REG3]], [[REG3]], [[REG1:[%a-z0-9]+]] - -; convert float to half -; CHECK-NEXT: [[LABEL2]]: -; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee -; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]]) -; CHECK-LIBCALL-NEXT: popq [[ADDR]] -; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG1]], [[REG4:%[a-z0-9]+]] -; CHECK-F16C-NEXT: vmovd [[REG4]], %eax -; CHECK-F16C-NEXT: movw %ax, (%rsi) -; CHECK-NEXT: retq - +; CHECK-LIBCALL-LABEL: test_uitofp_i64: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: movq %rsi, %rbx +; CHECK-LIBCALL-NEXT: testq %rdi, %rdi +; CHECK-LIBCALL-NEXT: js .LBB10_1 +; CHECK-LIBCALL-NEXT: # BB#2: +; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0 +; CHECK-LIBCALL-NEXT: jmp .LBB10_3 +; CHECK-LIBCALL-NEXT: .LBB10_1: +; CHECK-LIBCALL-NEXT: movq %rdi, %rax +; CHECK-LIBCALL-NEXT: shrq %rax +; CHECK-LIBCALL-NEXT: andl $1, %edi +; CHECK-LIBCALL-NEXT: orq %rax, %rdi +; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0 +; CHECK-LIBCALL-NEXT: addss %xmm0, %xmm0 +; CHECK-LIBCALL-NEXT: .LBB10_3: +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_uitofp_i64: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: testq %rdi, %rdi +; BWON-F16C-NEXT: js .LBB10_1 +; BWON-F16C-NEXT: # BB#2: +; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 +; BWON-F16C-NEXT: jmp .LBB10_3 +; BWON-F16C-NEXT: .LBB10_1: +; BWON-F16C-NEXT: movq %rdi, %rax +; BWON-F16C-NEXT: shrq %rax +; BWON-F16C-NEXT: andl $1, %edi +; BWON-F16C-NEXT: orq %rax, %rdi +; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 +; BWON-F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; BWON-F16C-NEXT: .LBB10_3: +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: movw %ax, (%rsi) +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_uitofp_i64: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $24, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: xorl %eax, %eax +; CHECK-I686-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: setns %al +; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fadds {{\.LCPI.*}}(,%eax,4) +; CHECK-I686-NEXT: fstps (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movw %ax, (%esi) +; CHECK-I686-NEXT: 
addl $24, %esp +; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: retl %r = uitofp i64 %a to half store half %r, half* %p ret void } define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 { -; CHECK-LABEL: test_extend32_vec4: - -; CHECK-LIBCALL: callq __gnu_h2f_ieee -; CHECK-LIBCALL: callq __gnu_h2f_ieee -; CHECK-LIBCALL: callq __gnu_h2f_ieee -; CHECK-LIBCALL: callq __gnu_h2f_ieee -; CHECK-F16C: vcvtph2ps -; CHECK-F16C: vcvtph2ps -; CHECK-F16C: vcvtph2ps -; CHECK-F16C: vcvtph2ps +; CHECK-LIBCALL-LABEL: test_extend32_vec4: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: subq $48, %rsp +; CHECK-LIBCALL-NEXT: movq %rdi, %rbx +; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-LIBCALL-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-LIBCALL-NEXT: addq $48, %rsp +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_extend32_vec4: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: movswl 6(%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: movswl 4(%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm1 +; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; BWON-F16C-NEXT: movswl (%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm2 +; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 +; BWON-F16C-NEXT: movswl 2(%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm3 +; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3 +; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] +; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_extend32_vec4: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $56, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-I686-NEXT: movzwl 2(%esi), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill +; CHECK-I686-NEXT: movzwl 4(%esi), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill +; CHECK-I686-NEXT: movzwl 6(%esi), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: movzwl (%esi), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: calll 
__gnu_h2f_ieee +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-I686-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-I686-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-I686-NEXT: addl $56, %esp +; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: retl %a = load <4 x half>, <4 x half>* %p, align 8 %b = fpext <4 x half> %a to <4 x float> ret <4 x float> %b } define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 { -; CHECK-LABEL: test_extend64_vec4 - -; CHECK-LIBCALL: callq __gnu_h2f_ieee -; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee -; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee -; CHECK-LIBCALL-DAG: callq __gnu_h2f_ieee -; CHECK-LIBCALL-DAG: cvtss2sd -; CHECK-LIBCALL-DAG: cvtss2sd -; CHECK-LIBCALL-DAG: cvtss2sd -; CHECK-LIBCALL: cvtss2sd -; CHECK-F16C: vcvtph2ps -; CHECK-F16C-DAG: vcvtph2ps -; CHECK-F16C-DAG: vcvtph2ps -; CHECK-F16C-DAG: vcvtph2ps -; CHECK-F16C-DAG: vcvtss2sd -; CHECK-F16C-DAG: vcvtss2sd -; CHECK-F16C-DAG: vcvtss2sd -; CHECK-F16C: vcvtss2sd +; CHECK-LIBCALL-LABEL: test_extend64_vec4: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: subq $16, %rsp +; CHECK-LIBCALL-NEXT: movq %rdi, %rbx +; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill +; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill +; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill +; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm1 +; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload +; CHECK-LIBCALL-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 +; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload +; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm2 +; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload +; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm1 +; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-LIBCALL-NEXT: addq $16, %rsp +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_extend64_vec4: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: movswl (%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: movswl 2(%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm1 +; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; BWON-F16C-NEXT: movswl 4(%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm2 +; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 +; BWON-F16C-NEXT: movswl 6(%rdi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm3 +; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3 +; BWON-F16C-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 +; BWON-F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 +; BWON-F16C-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; 
BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; BWON-F16C-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_extend64_vec4: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $88, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-I686-NEXT: movzwl 6(%esi), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill +; CHECK-I686-NEXT: movzwl 4(%esi), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill +; CHECK-I686-NEXT: movzwl 2(%esi), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill +; CHECK-I686-NEXT: movzwl (%esi), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload +; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload +; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload +; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-I686-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-I686-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; CHECK-I686-NEXT: addl $88, %esp +; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: retl %a = load <4 x half>, <4 x half>* %p, align 8 %b = fpext <4 x half> %a to <4 x double> ret <4 x double> %b } -define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) { -; CHECK-LABEL: test_trunc32_vec4: - -; CHECK-LIBCALL: callq __gnu_f2h_ieee -; CHECK-LIBCALL: callq __gnu_f2h_ieee -; CHECK-LIBCALL: callq __gnu_f2h_ieee -; CHECK-LIBCALL: callq __gnu_f2h_ieee -; CHECK-F16C: vcvtps2ph -; CHECK-F16C: vcvtps2ph -; CHECK-F16C: vcvtps2ph -; CHECK-F16C: vcvtps2ph -; CHECK: movw -; CHECK: movw -; CHECK: movw -; CHECK: movw +define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 { +; BWON-NOF16C-LABEL: test_trunc32_vec4: +; BWON-NOF16C: # BB#0: +; BWON-NOF16C-NEXT: pushq %rbp +; BWON-NOF16C-NEXT: pushq %r15 +; BWON-NOF16C-NEXT: pushq %r14 +; BWON-NOF16C-NEXT: pushq %rbx +; BWON-NOF16C-NEXT: subq $24, %rsp +; BWON-NOF16C-NEXT: movq %rdi, %rbx +; BWON-NOF16C-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee +; BWON-NOF16C-NEXT: movl %eax, %r14d +; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee +; BWON-NOF16C-NEXT: movl %eax, %r15d +; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee +; BWON-NOF16C-NEXT: movl %eax, %ebp +; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee +; BWON-NOF16C-NEXT: movw %ax, (%rbx) +; BWON-NOF16C-NEXT: movw %bp, 6(%rbx) +; BWON-NOF16C-NEXT: movw %r15w, 4(%rbx) +; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx) +; BWON-NOF16C-NEXT: addq $24, %rsp +; BWON-NOF16C-NEXT: popq %rbx 
+; BWON-NOF16C-NEXT: popq %r14 +; BWON-NOF16C-NEXT: popq %r15 +; BWON-NOF16C-NEXT: popq %rbp +; BWON-NOF16C-NEXT: retq +; +; BWOFF-LABEL: test_trunc32_vec4: +; BWOFF: # BB#0: +; BWOFF-NEXT: pushq %rbp +; BWOFF-NEXT: pushq %r15 +; BWOFF-NEXT: pushq %r14 +; BWOFF-NEXT: pushq %rbx +; BWOFF-NEXT: subq $24, %rsp +; BWOFF-NEXT: movq %rdi, %rbx +; BWOFF-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; BWOFF-NEXT: callq __gnu_f2h_ieee +; BWOFF-NEXT: movw %ax, %r14w +; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; BWOFF-NEXT: callq __gnu_f2h_ieee +; BWOFF-NEXT: movw %ax, %r15w +; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; BWOFF-NEXT: callq __gnu_f2h_ieee +; BWOFF-NEXT: movw %ax, %bp +; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWOFF-NEXT: callq __gnu_f2h_ieee +; BWOFF-NEXT: movw %ax, (%rbx) +; BWOFF-NEXT: movw %bp, 6(%rbx) +; BWOFF-NEXT: movw %r15w, 4(%rbx) +; BWOFF-NEXT: movw %r14w, 2(%rbx) +; BWOFF-NEXT: addq $24, %rsp +; BWOFF-NEXT: popq %rbx +; BWOFF-NEXT: popq %r14 +; BWOFF-NEXT: popq %r15 +; BWOFF-NEXT: popq %rbp +; BWOFF-NEXT: retq +; +; BWON-F16C-LABEL: test_trunc32_vec4: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; BWON-F16C-NEXT: vmovd %xmm1, %eax +; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; BWON-F16C-NEXT: vmovd %xmm1, %ecx +; BWON-F16C-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; BWON-F16C-NEXT: vmovd %xmm1, %edx +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %esi +; BWON-F16C-NEXT: movw %si, (%rdi) +; BWON-F16C-NEXT: movw %dx, 6(%rdi) +; BWON-F16C-NEXT: movw %cx, 4(%rdi) +; BWON-F16C-NEXT: movw %ax, 2(%rdi) +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_trunc32_vec4: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: pushl %ebp +; CHECK-I686-NEXT: pushl %ebx +; CHECK-I686-NEXT: pushl %edi +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $44, %esp +; CHECK-I686-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-I686-NEXT: movaps %xmm0, %xmm1 +; CHECK-I686-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; CHECK-I686-NEXT: movss %xmm1, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movw %ax, %si +; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movw %ax, %di +; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movw %ax, %bx +; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movw %ax, (%ebp) +; CHECK-I686-NEXT: movw %bx, 6(%ebp) +; CHECK-I686-NEXT: movw %di, 4(%ebp) +; CHECK-I686-NEXT: movw %si, 2(%ebp) +; CHECK-I686-NEXT: addl $44, %esp +; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: popl %edi +; CHECK-I686-NEXT: popl %ebx +; CHECK-I686-NEXT: popl %ebp +; CHECK-I686-NEXT: retl %v = fptrunc <4 x float> %a to <4 x half> store <4 x half> %v, <4 x half>* %p ret 
void } -define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) { -; CHECK-LABEL: test_trunc64_vec4: -; CHECK: callq __truncdfhf2 -; CHECK: callq __truncdfhf2 -; CHECK: callq __truncdfhf2 -; CHECK: callq __truncdfhf2 -; CHECK: movw -; CHECK: movw -; CHECK: movw -; CHECK: movw +define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 { +; BWON-NOF16C-LABEL: test_trunc64_vec4: +; BWON-NOF16C: # BB#0: +; BWON-NOF16C-NEXT: pushq %rbp +; BWON-NOF16C-NEXT: pushq %r15 +; BWON-NOF16C-NEXT: pushq %r14 +; BWON-NOF16C-NEXT: pushq %rbx +; BWON-NOF16C-NEXT: subq $40, %rsp +; BWON-NOF16C-NEXT: movq %rdi, %rbx +; BWON-NOF16C-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; BWON-NOF16C-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; BWON-NOF16C-NEXT: callq __truncdfhf2 +; BWON-NOF16C-NEXT: movl %eax, %r14d +; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; BWON-NOF16C-NEXT: callq __truncdfhf2 +; BWON-NOF16C-NEXT: movl %eax, %r15d +; BWON-NOF16C-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; BWON-NOF16C-NEXT: callq __truncdfhf2 +; BWON-NOF16C-NEXT: movl %eax, %ebp +; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWON-NOF16C-NEXT: callq __truncdfhf2 +; BWON-NOF16C-NEXT: movw %ax, 4(%rbx) +; BWON-NOF16C-NEXT: movw %bp, (%rbx) +; BWON-NOF16C-NEXT: movw %r15w, 6(%rbx) +; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx) +; BWON-NOF16C-NEXT: addq $40, %rsp +; BWON-NOF16C-NEXT: popq %rbx +; BWON-NOF16C-NEXT: popq %r14 +; BWON-NOF16C-NEXT: popq %r15 +; BWON-NOF16C-NEXT: popq %rbp +; BWON-NOF16C-NEXT: retq +; +; BWOFF-LABEL: test_trunc64_vec4: +; BWOFF: # BB#0: +; BWOFF-NEXT: pushq %rbp +; BWOFF-NEXT: pushq %r15 +; BWOFF-NEXT: pushq %r14 +; BWOFF-NEXT: pushq %rbx +; BWOFF-NEXT: subq $40, %rsp +; BWOFF-NEXT: movq %rdi, %rbx +; BWOFF-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; BWOFF-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; BWOFF-NEXT: callq __truncdfhf2 +; BWOFF-NEXT: movw %ax, %r14w +; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; BWOFF-NEXT: callq __truncdfhf2 +; BWOFF-NEXT: movw %ax, %r15w +; BWOFF-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; BWOFF-NEXT: callq __truncdfhf2 +; BWOFF-NEXT: movw %ax, %bp +; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; BWOFF-NEXT: callq __truncdfhf2 +; BWOFF-NEXT: movw %ax, 4(%rbx) +; BWOFF-NEXT: movw %bp, (%rbx) +; BWOFF-NEXT: movw %r15w, 6(%rbx) +; BWOFF-NEXT: movw %r14w, 2(%rbx) +; BWOFF-NEXT: addq $40, %rsp +; BWOFF-NEXT: popq %rbx +; BWOFF-NEXT: popq %r14 +; BWOFF-NEXT: popq %r15 +; BWOFF-NEXT: popq %rbp +; BWOFF-NEXT: retq +; +; BWON-F16C-LABEL: test_trunc64_vec4: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: pushq %rbp +; BWON-F16C-NEXT: pushq %r15 +; BWON-F16C-NEXT: pushq %r14 +; BWON-F16C-NEXT: pushq %rbx +; BWON-F16C-NEXT: subq $88, %rsp +; BWON-F16C-NEXT: movq %rdi, %rbx +; BWON-F16C-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill +; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; BWON-F16C-NEXT: vzeroupper +; BWON-F16C-NEXT: callq __truncdfhf2 +; BWON-F16C-NEXT: movl %eax, %r14d +; BWON-F16C-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 +; BWON-F16C-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; BWON-F16C-NEXT: vzeroupper +; BWON-F16C-NEXT: callq 
__truncdfhf2 +; BWON-F16C-NEXT: movl %eax, %r15d +; BWON-F16C-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload +; BWON-F16C-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; BWON-F16C-NEXT: vzeroupper +; BWON-F16C-NEXT: callq __truncdfhf2 +; BWON-F16C-NEXT: movl %eax, %ebp +; BWON-F16C-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; BWON-F16C-NEXT: callq __truncdfhf2 +; BWON-F16C-NEXT: movw %ax, 4(%rbx) +; BWON-F16C-NEXT: movw %bp, (%rbx) +; BWON-F16C-NEXT: movw %r15w, 6(%rbx) +; BWON-F16C-NEXT: movw %r14w, 2(%rbx) +; BWON-F16C-NEXT: addq $88, %rsp +; BWON-F16C-NEXT: popq %rbx +; BWON-F16C-NEXT: popq %r14 +; BWON-F16C-NEXT: popq %r15 +; BWON-F16C-NEXT: popq %rbp +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_trunc64_vec4: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: pushl %ebp +; CHECK-I686-NEXT: pushl %ebx +; CHECK-I686-NEXT: pushl %edi +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $60, %esp +; CHECK-I686-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) # 16-byte Spill +; CHECK-I686-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-I686-NEXT: movlps %xmm0, (%esp) +; CHECK-I686-NEXT: calll __truncdfhf2 +; CHECK-I686-NEXT: movw %ax, %si +; CHECK-I686-NEXT: movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: movhpd %xmm0, (%esp) +; CHECK-I686-NEXT: calll __truncdfhf2 +; CHECK-I686-NEXT: movw %ax, %di +; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: movlps %xmm0, (%esp) +; CHECK-I686-NEXT: calll __truncdfhf2 +; CHECK-I686-NEXT: movw %ax, %bx +; CHECK-I686-NEXT: movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: movhpd %xmm0, (%esp) +; CHECK-I686-NEXT: calll __truncdfhf2 +; CHECK-I686-NEXT: movw %ax, 6(%ebp) +; CHECK-I686-NEXT: movw %bx, 4(%ebp) +; CHECK-I686-NEXT: movw %di, 2(%ebp) +; CHECK-I686-NEXT: movw %si, (%ebp) +; CHECK-I686-NEXT: addl $60, %esp +; CHECK-I686-NEXT: popl %esi +; CHECK-I686-NEXT: popl %edi +; CHECK-I686-NEXT: popl %ebx +; CHECK-I686-NEXT: popl %ebp +; CHECK-I686-NEXT: retl %v = fptrunc <4 x double> %a to <4 x half> store <4 x half> %v, <4 x half>* %p ret void @@ -272,40 +839,98 @@ declare float @test_floatret(); ; to f80 and then rounded to f32. The DAG combiner should not combine this ; fp_round and the subsequent fptrunc from float to half. 
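To see why that combine is unsafe, a minimal hand-written sketch (illustrative only, not from the patch): widening is exact and may be folded away, but two successive narrowing steps each round, and double rounding can yield a different half result than a single direct round.

define half @sketch_fold_ok(half %h) {
  %wide = fpext half %h to float       ; exact widening, no rounding
  %back = fptrunc float %wide to half  ; safe for the combiner to fold to %h
  ret half %back
}

define half @sketch_no_fold(double %d) {
  %narrow = fptrunc double %d to float ; first rounding step
  %h = fptrunc float %narrow to half   ; second rounding step; folding the
  ret half %h                          ; pair into one double-to-half round can change the result
}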
define half @test_f80trunc_nodagcombine() #0 { -; CHECK-LABEL: test_f80trunc_nodagcombine: -; CHECK-I686-NOT: calll __truncxfhf2 +; CHECK-LIBCALL-LABEL: test_f80trunc_nodagcombine: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rax +; CHECK-LIBCALL-NEXT: callq test_floatret +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee +; CHECK-LIBCALL-NEXT: movzwl %ax, %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: popq %rax +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_f80trunc_nodagcombine: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: pushq %rax +; BWON-F16C-NEXT: callq test_floatret +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: popq %rax +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_f80trunc_nodagcombine: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: subl $12, %esp +; CHECK-I686-NEXT: calll test_floatret +; CHECK-I686-NEXT: fstps (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movzwl %ax, %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: addl $12, %esp +; CHECK-I686-NEXT: retl %1 = call float @test_floatret() %2 = fptrunc float %1 to half ret half %2 } -; CHECK-LABEL: test_sitofp_fadd_i32: -; CHECK-LIBCALL-NEXT: pushq %rbx -; CHECK-LIBCALL-NEXT: subq $16, %rsp -; CHECK-LIBCALL-NEXT: movl %edi, %ebx -; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp) -; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0 -; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee -; CHECK-LIBCALL-NEXT: movzwl %ax, %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee -; CHECK-LIBCALL-NEXT: addss 12(%rsp), %xmm0 -; CHECK-LIBCALL-NEXT: addq $16, %rsp -; CHECK-LIBCALL-NEXT: popq %rbx -; CHECK-LIBCALL-NEXT: retq -; CHECK-F16C-NEXT: movswl (%rsi), %eax -; CHECK-F16C-NEXT: vmovd %eax, %xmm0 -; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1 -; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-F16C-NEXT: retq define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 { +; CHECK-LIBCALL-LABEL: test_sitofp_fadd_i32: +; CHECK-LIBCALL: # BB#0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: subq $16, %rsp +; CHECK-LIBCALL-NEXT: movl %edi, %ebx +; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill +; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0 +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee +; CHECK-LIBCALL-NEXT: movzwl %ax, %edi +; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee +; CHECK-LIBCALL-NEXT: addss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload +; CHECK-LIBCALL-NEXT: addq $16, %rsp +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_sitofp_fadd_i32: +; BWON-F16C: # BB#0: +; BWON-F16C-NEXT: movswl (%rsi), %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; BWON-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; BWON-F16C-NEXT: retq +; +; CHECK-I686-LABEL: test_sitofp_fadd_i32: +; CHECK-I686: # BB#0: +; CHECK-I686-NEXT: subl $28, %esp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movzwl (%eax), %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: 
calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) # 4-byte Spill +; CHECK-I686-NEXT: xorps %xmm0, %xmm0 +; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: movzwl %ax, %eax +; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # 4-byte Reload +; CHECK-I686-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: addl $28, %esp +; CHECK-I686-NEXT: retl %tmp0 = load half, half* %b %tmp1 = sitofp i32 %a to half %tmp2 = fadd half %tmp0, %tmp1 diff --git a/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/test/CodeGen/X86/illegal-bitfield-loadstore.ll index ceb4657119065..5425670fbb1ed 100644 --- a/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -1,17 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64 define void @i24_or(i24* %a) { -; CHECK-LABEL: i24_or: -; CHECK: # BB#0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: movw %cx, (%rdi) -; CHECK-NEXT: retq +; X86-LABEL: i24_or: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: movzbl 2(%ecx), %eax +; X86-NEXT: movb %al, 2(%ecx) +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl $384, %eax # imm = 0x180 +; X86-NEXT: movw %ax, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: i24_or: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzbl 2(%rdi), %ecx +; X64-NEXT: movb %cl, 2(%rdi) +; X64-NEXT: shll $16, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: orl $384, %ecx # imm = 0x180 +; X64-NEXT: movw %cx, (%rdi) +; X64-NEXT: retq %aa = load i24, i24* %a, align 1 %b = or i24 %aa, 384 store i24 %b, i24* %a, align 1 @@ -19,17 +32,30 @@ define void @i24_or(i24* %a) { } define void @i24_and_or(i24* %a) { -; CHECK-LABEL: i24_and_or: -; CHECK: # BB#0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: movb %cl, 2(%rdi) -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: orl $384, %ecx # imm = 0x180 -; CHECK-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 -; CHECK-NEXT: movw %cx, (%rdi) -; CHECK-NEXT: retq +; X86-LABEL: i24_and_or: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: movzbl 2(%ecx), %eax +; X86-NEXT: movb %al, 2(%ecx) +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl $384, %eax # imm = 0x180 +; X86-NEXT: andl $16777088, %eax # imm = 0xFFFF80 +; X86-NEXT: movw %ax, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: i24_and_or: +; X64: # BB#0: +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: movzbl 2(%rdi), %ecx +; X64-NEXT: movb %cl, 2(%rdi) +; X64-NEXT: shll $16, 
%ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: orl $384, %ecx # imm = 0x180 +; X64-NEXT: andl $16777088, %ecx # imm = 0xFFFF80 +; X64-NEXT: movw %cx, (%rdi) +; X64-NEXT: retq %b = load i24, i24* %a, align 1 %c = and i24 %b, -128 %d = or i24 %c, 384 @@ -38,19 +64,40 @@ define void @i24_and_or(i24* %a) { } define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { -; CHECK-LABEL: i24_insert_bit: -; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl (%rdi), %ecx -; CHECK-NEXT: movzbl 2(%rdi), %edx -; CHECK-NEXT: movb %dl, 2(%rdi) -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shll $13, %eax -; CHECK-NEXT: andl $16769023, %edx # imm = 0xFFDFFF -; CHECK-NEXT: orl %eax, %edx -; CHECK-NEXT: movw %dx, (%rdi) -; CHECK-NEXT: retq +; X86-LABEL: i24_insert_bit: +; X86: # BB#0: +; X86-NEXT: pushl %esi +; X86-NEXT: .Lcfi0: +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .Lcfi1: +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: movzbl 2(%ecx), %eax +; X86-NEXT: movb %al, 2(%ecx) +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: shll $13, %edx +; X86-NEXT: andl $16769023, %eax # imm = 0xFFDFFF +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movw %ax, (%ecx) +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: i24_insert_bit: +; X64: # BB#0: +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: movzwl (%rdi), %ecx +; X64-NEXT: movzbl 2(%rdi), %edx +; X64-NEXT: movb %dl, 2(%rdi) +; X64-NEXT: shll $16, %edx +; X64-NEXT: orl %ecx, %edx +; X64-NEXT: shll $13, %eax +; X64-NEXT: andl $16769023, %edx # imm = 0xFFDFFF +; X64-NEXT: orl %eax, %edx +; X64-NEXT: movw %dx, (%rdi) +; X64-NEXT: retq %extbit = zext i1 %bit to i24 %b = load i24, i24* %a, align 1 %extbit.shl = shl nuw nsw i24 %extbit, 13 @@ -61,22 +108,28 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) { } define void @i56_or(i56* %a) { -; CHECK-LABEL: i56_or: -; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def> -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movl %edx, (%rdi) -; CHECK-NEXT: shrq $32, %rdx -; CHECK-NEXT: movw %dx, 4(%rdi) -; CHECK-NEXT: retq +; X86-LABEL: i56_or: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $384, (%eax) # imm = 0x180 +; X86-NEXT: retl +; +; X64-LABEL: i56_or: +; X64: # BB#0: +; X64-NEXT: movzwl 4(%rdi), %eax +; X64-NEXT: movzbl 6(%rdi), %ecx +; X64-NEXT: movl (%rdi), %edx +; X64-NEXT: movb %cl, 6(%rdi) +; X64-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def> +; X64-NEXT: shll $16, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: shlq $32, %rcx +; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: orq $384, %rdx # imm = 0x180 +; X64-NEXT: movl %edx, (%rdi) +; X64-NEXT: shrq $32, %rdx +; X64-NEXT: movw %dx, 4(%rdi) +; X64-NEXT: retq %aa = load i56, i56* %a, align 1 %b = or i56 %aa, 384 store i56 %b, i56* %a, align 1 @@ -84,24 +137,33 @@ define void @i56_or(i56* %a) { } define void @i56_and_or(i56* %a) { -; CHECK-LABEL: i56_and_or: -; CHECK: # BB#0: -; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: movzbl 6(%rdi), %ecx -; CHECK-NEXT: movl (%rdi), %edx -; CHECK-NEXT: movb %cl, 6(%rdi) -; CHECK-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> 
%RCX<def> -; CHECK-NEXT: shll $16, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: orq $384, %rdx # imm = 0x180 -; CHECK-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80 -; CHECK-NEXT: andq %rdx, %rax -; CHECK-NEXT: movl %eax, (%rdi) -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movw %ax, 4(%rdi) -; CHECK-NEXT: retq +; X86-LABEL: i56_and_or: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $384, %ecx # imm = 0x180 +; X86-NEXT: orl (%eax), %ecx +; X86-NEXT: andl $-128, %ecx +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: i56_and_or: +; X64: # BB#0: +; X64-NEXT: movzwl 4(%rdi), %eax +; X64-NEXT: movzbl 6(%rdi), %ecx +; X64-NEXT: movl (%rdi), %edx +; X64-NEXT: movb %cl, 6(%rdi) +; X64-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def> +; X64-NEXT: shll $16, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: shlq $32, %rcx +; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: orq $384, %rdx # imm = 0x180 +; X64-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80 +; X64-NEXT: andq %rdx, %rax +; X64-NEXT: movl %eax, (%rdi) +; X64-NEXT: shrq $32, %rax +; X64-NEXT: movw %ax, 4(%rdi) +; X64-NEXT: retq %b = load i56, i56* %a, align 1 %c = and i56 %b, -128 %d = or i56 %c, 384 @@ -110,26 +172,37 @@ define void @i56_and_or(i56* %a) { } define void @i56_insert_bit(i56* %a, i1 zeroext %bit) { -; CHECK-LABEL: i56_insert_bit: -; CHECK: # BB#0: -; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: movzwl 4(%rdi), %ecx -; CHECK-NEXT: movzbl 6(%rdi), %edx -; CHECK-NEXT: movl (%rdi), %esi -; CHECK-NEXT: movb %dl, 6(%rdi) -; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def> -; CHECK-NEXT: shll $16, %edx -; CHECK-NEXT: orl %ecx, %edx -; CHECK-NEXT: shlq $32, %rdx -; CHECK-NEXT: orq %rdx, %rsi -; CHECK-NEXT: shlq $13, %rax -; CHECK-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movl %ecx, (%rdi) -; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movw %cx, 4(%rdi) -; CHECK-NEXT: retq +; X86-LABEL: i56_insert_bit: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $13, %ecx +; X86-NEXT: movl $-8193, %edx # imm = 0xDFFF +; X86-NEXT: andl (%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: i56_insert_bit: +; X64: # BB#0: +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: movzwl 4(%rdi), %ecx +; X64-NEXT: movzbl 6(%rdi), %edx +; X64-NEXT: movl (%rdi), %esi +; X64-NEXT: movb %dl, 6(%rdi) +; X64-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def> +; X64-NEXT: shll $16, %edx +; X64-NEXT: orl %ecx, %edx +; X64-NEXT: shlq $32, %rdx +; X64-NEXT: orq %rdx, %rsi +; X64-NEXT: shlq $13, %rax +; X64-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF +; X64-NEXT: andq %rsi, %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl %ecx, (%rdi) +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: movw %cx, 4(%rdi) +; X64-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, i56* %a, align 1 %extbit.shl = shl nuw nsw i56 %extbit, 13 diff --git a/test/CodeGen/X86/optimize-max-1.ll b/test/CodeGen/X86/optimize-max-1.ll index 11e2f9a93a57f..08cb86ab39896 100644 --- a/test/CodeGen/X86/optimize-max-1.ll +++ b/test/CodeGen/X86/optimize-max-1.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -march=x86-64 | not grep cmov +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown | FileCheck %s ; LSR should be able to eliminate both smax and umax expressions ; in loop trip counts. @@ -6,6 +7,18 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" define void @fs(double* nocapture %p, i64 %n) nounwind { +; CHECK-LABEL: fs: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_1: # %bb +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rdi,%rax,8) +; CHECK-NEXT: incq %rax +; CHECK-NEXT: cmpq %rsi, %rax +; CHECK-NEXT: jl .LBB0_1 +; CHECK-NEXT: # BB#2: # %return +; CHECK-NEXT: retq entry: %tmp = icmp slt i64 %n, 1 ; <i1> [#uses=1] %smax = select i1 %tmp, i64 1, i64 %n ; <i64> [#uses=1] @@ -24,6 +37,18 @@ return: ; preds = %bb } define void @bs(double* nocapture %p, i64 %n) nounwind { +; CHECK-LABEL: bs: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB1_1: # %bb +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rdi,%rax,8) +; CHECK-NEXT: incq %rax +; CHECK-NEXT: cmpq %rsi, %rax +; CHECK-NEXT: jl .LBB1_1 +; CHECK-NEXT: # BB#2: # %return +; CHECK-NEXT: retq entry: %tmp = icmp sge i64 %n, 1 ; <i1> [#uses=1] %smax = select i1 %tmp, i64 %n, i64 1 ; <i64> [#uses=1] @@ -42,6 +67,18 @@ return: ; preds = %bb } define void @fu(double* nocapture %p, i64 %n) nounwind { +; CHECK-LABEL: fu: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB2_1: # %bb +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rdi,%rax,8) +; CHECK-NEXT: incq %rax +; CHECK-NEXT: cmpq %rsi, %rax +; CHECK-NEXT: jb .LBB2_1 +; CHECK-NEXT: # BB#2: # %return +; CHECK-NEXT: retq entry: %tmp = icmp eq i64 %n, 0 ; <i1> [#uses=1] %umax = select i1 %tmp, i64 1, i64 %n ; <i64> [#uses=1] @@ -60,6 +97,18 @@ return: ; preds = %bb } define void @bu(double* nocapture %p, i64 %n) nounwind { +; CHECK-LABEL: bu: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB3_1: # %bb +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rdi,%rax,8) +; CHECK-NEXT: incq %rax +; CHECK-NEXT: cmpq %rsi, %rax +; CHECK-NEXT: jb .LBB3_1 +; CHECK-NEXT: # BB#2: # %return +; CHECK-NEXT: retq entry: %tmp = icmp ne i64 %n, 0 ; <i1> [#uses=1] %umax = select i1 %tmp, i64 %n, i64 1 ; <i64> [#uses=1] diff --git a/test/CodeGen/X86/optimize-max-2.ll b/test/CodeGen/X86/optimize-max-2.ll index 45b542e2267c1..37d2a20975a04 100644 --- a/test/CodeGen/X86/optimize-max-2.ll +++ b/test/CodeGen/X86/optimize-max-2.ll @@ -1,8 +1,5 @@ -; RUN: llc < %s -march=x86-64 | grep cmov | count 2 -; RUN: llc < %s -march=x86-64 | FileCheck %s - -; CHECK: jne -; CHECK-NOT: jne +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s ; LSR's OptimizeMax function shouldn't try to eliminate this max, because ; it has three operands. 
@@ -10,6 +7,24 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" define void @foo(double* nocapture %p, i64 %x, i64 %y) nounwind { +; CHECK-LABEL: foo: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovneq %rdx, %rax +; CHECK-NEXT: cmpq %rsi, %rax +; CHECK-NEXT: cmovbeq %rsi, %rax +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_1: # %bb4 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: addsd %xmm0, %xmm0 +; CHECK-NEXT: movsd %xmm0, (%rdi) +; CHECK-NEXT: addq $8, %rdi +; CHECK-NEXT: decq %rax +; CHECK-NEXT: jne .LBB0_1 +; CHECK-NEXT: # BB#2: # %return +; CHECK-NEXT: retq entry: %tmp = icmp eq i64 %y, 0 ; <i1> [#uses=1] %umax = select i1 %tmp, i64 1, i64 %y ; <i64> [#uses=2] @@ -30,3 +45,4 @@ bb4: ; preds = %bb4, %entry return: ; preds = %bb4 ret void } + diff --git a/test/CodeGen/X86/pr15309.ll b/test/CodeGen/X86/pr15309.ll index e9d9b9e54c137..0301b58def1c3 100644 --- a/test/CodeGen/X86/pr15309.ll +++ b/test/CodeGen/X86/pr15309.ll @@ -1,15 +1,43 @@ -; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-linux-pc | FileCheck %s -define void @test_convert_float2_ulong2(<2 x i64>* nocapture %src, <2 x float>* nocapture %dest) noinline { -L.entry: - %0 = getelementptr <2 x i64>, <2 x i64>* %src, i32 10 - %1 = load <2 x i64>, <2 x i64>* %0, align 16 - %2 = uitofp <2 x i64> %1 to <2 x float> - %3 = getelementptr <2 x float>, <2 x float>* %dest, i32 10 - store <2 x float> %2, <2 x float>* %3, align 8 +define void @test_convert_float2_ulong2(<2 x i64>* nocapture %src, <2 x float>* nocapture %dest) nounwind { +; CHECK-LABEL: test_convert_float2_ulong2: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: subl $20, %esp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl 168(%ecx), %edx +; CHECK-NEXT: movl 172(%ecx), %esi +; CHECK-NEXT: movl 160(%ecx), %edi +; CHECK-NEXT: movl 164(%ecx), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, (%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testl %ecx, %ecx +; CHECK-NEXT: setns %dl +; CHECK-NEXT: fildll (%esp) +; CHECK-NEXT: fadds {{\.LCPI.*}}(,%edx,4) +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: setns %cl +; CHECK-NEXT: fildll {{[0-9]+}}(%esp) +; CHECK-NEXT: fadds {{\.LCPI.*}}(,%ecx,4) +; CHECK-NEXT: fstps 84(%eax) +; CHECK-NEXT: fstps 80(%eax) +; CHECK-NEXT: addl $20, %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: retl + %t0 = getelementptr <2 x i64>, <2 x i64>* %src, i32 10 + %t1 = load <2 x i64>, <2 x i64>* %t0, align 16 + %t2 = uitofp <2 x i64> %t1 to <2 x float> + %t3 = getelementptr <2 x float>, <2 x float>* %dest, i32 10 + store <2 x float> %t2, <2 x float>* %t3, align 8 ret void } -; CHECK: test_convert_float2_ulong2 -; CHECK-NOT: cvtpd2ps -; CHECK: ret diff --git a/test/CodeGen/X86/pr23603.ll b/test/CodeGen/X86/pr23603.ll index 6f856aedb8d58..315e60768613a 100644 --- a/test/CodeGen/X86/pr23603.ll +++ b/test/CodeGen/X86/pr23603.ll @@ -1,14 +1,29 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s declare void @free_v() -define void @f(i32* %x, i32 %c32, i32* %y) { -; CHECK-LABEL: f +define void @f(i32* %x, i32 %c32, i32* %y) nounwind { +; CHECK-LABEL: f: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: movl (%rdi), %ebx +; CHECK-NEXT: callq free_v +; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: je .LBB0_2 +; CHECK-NEXT: # BB#1: # %left +; CHECK-NEXT: movl %ebx, (%r14) +; CHECK-NEXT: .LBB0_2: # %merge +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq entry: %v = load i32, i32* %x, !invariant.load !0 -; CHECK: movl (%rdi), %ebx -; CHECK: free_v -; CHECK-NOT: movl (%rdi), %ebx call void @free_v() %c = icmp ne i32 %c32, 0 br i1 %c, label %left, label %merge diff --git a/test/CodeGen/X86/pr33715.ll b/test/CodeGen/X86/pr33715.ll new file mode 100644 index 0000000000000..15432cfdb512c --- /dev/null +++ b/test/CodeGen/X86/pr33715.ll @@ -0,0 +1,16 @@ +; Make sure we don't crash with a build vector of integer constants. +; RUN: llc %s -o /dev/null + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @patatino() { + %tmp = insertelement <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>, i32 1, i32 2 + %tmp1 = insertelement <4 x i32> %tmp, i32 1, i32 3 + %tmp2 = icmp ne <4 x i32> %tmp1, zeroinitializer + %tmp3 = icmp slt <4 x i32> %tmp1, <i32 4, i32 4, i32 4, i32 4> + %tmp4 = or <4 x i1> %tmp2, %tmp3 + %tmp5 = select <4 x i1> %tmp4, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 4, i32 4, i32 4> + %tmp6 = extractelement <4 x i32> %tmp5, i32 0 + ret i32 %tmp6 +} diff --git a/test/CodeGen/X86/rdrand-x86_64.ll b/test/CodeGen/X86/rdrand-x86_64.ll new file mode 100644 index 0000000000000..06f1136087bbd --- /dev/null +++ b/test/CodeGen/X86/rdrand-x86_64.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s + +declare {i64, i32} @llvm.x86.rdrand.64() + +define i32 @_rdrand64_step(i64* %random_val) { +; CHECK-LABEL: _rdrand64_step: +; CHECK: # BB#0: +; CHECK-NEXT: rdrandq %rcx +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovael %ecx, %eax +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: retq + %call = call {i64, i32} @llvm.x86.rdrand.64() + %randval = extractvalue {i64, i32} %call, 0 + store i64 %randval, i64* %random_val + %isvalid = extractvalue {i64, i32} %call, 1 + ret i32 %isvalid +} diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll index 107cde05a0e6f..0638e00952822 100644 --- a/test/CodeGen/X86/rdrand.ll +++ b/test/CodeGen/X86/rdrand.ll @@ -1,66 +1,117 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s --check-prefix=X64 + declare {i16, i32} @llvm.x86.rdrand.16() declare {i32, i32} @llvm.x86.rdrand.32() -declare {i64, i32} @llvm.x86.rdrand.64() define i32 @_rdrand16_step(i16* %random_val) { +; X86-LABEL: _rdrand16_step: +; X86: # BB#0: +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: rdrandw %ax +; X86-NEXT: movzwl %ax, %edx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: movw %dx, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: _rdrand16_step: +; X64: # BB#0: +; X64-NEXT: rdrandw %ax +; X64-NEXT: movzwl %ax, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: movw %cx, (%rdi) +; X64-NEXT: retq %call = call {i16, i32} @llvm.x86.rdrand.16() %randval = extractvalue {i16, i32} %call, 0 store i16 %randval, i16* %random_val %isvalid = extractvalue {i16, i32} %call, 1 ret i32 %isvalid -; CHECK-LABEL: _rdrand16_step: -; CHECK: rdrandw %ax -; CHECK: movzwl %ax, %ecx -; CHECK: movl $1, %eax -; CHECK: cmovael %ecx, %eax -; CHECK: movw %cx, (%r[[A0:di|cx]]) -; CHECK: ret } define i32 @_rdrand32_step(i32* %random_val) { +; X86-LABEL: _rdrand32_step: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: rdrandl %edx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: _rdrand32_step: +; X64: # BB#0: +; X64-NEXT: rdrandl %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: movl %ecx, (%rdi) +; X64-NEXT: retq %call = call {i32, i32} @llvm.x86.rdrand.32() %randval = extractvalue {i32, i32} %call, 0 store i32 %randval, i32* %random_val %isvalid = extractvalue {i32, i32} %call, 1 ret i32 %isvalid -; CHECK-LABEL: _rdrand32_step: -; CHECK: rdrandl %e[[T0:[a-z]+]] -; CHECK: movl $1, %eax -; CHECK: cmovael %e[[T0]], %eax -; CHECK: movl %e[[T0]], (%r[[A0]]) -; CHECK: ret -} - -define i32 @_rdrand64_step(i64* %random_val) { - %call = call {i64, i32} @llvm.x86.rdrand.64() - %randval = extractvalue {i64, i32} %call, 0 - store i64 %randval, i64* %random_val - %isvalid = extractvalue {i64, i32} %call, 1 - ret i32 %isvalid -; CHECK-LABEL: _rdrand64_step: -; CHECK: rdrandq %r[[T1:[a-z]+]] -; CHECK: movl $1, %eax -; CHECK: cmovael %e[[T1]], %eax -; CHECK: movq %r[[T1]], (%r[[A0]]) -; CHECK: ret } ; Check that MachineCSE doesn't eliminate duplicate rdrand instructions. define i32 @CSE() nounwind { +; X86-LABEL: CSE: +; X86: # BB#0: +; X86-NEXT: rdrandl %ecx +; X86-NEXT: rdrandl %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: CSE: +; X64: # BB#0: +; X64-NEXT: rdrandl %ecx +; X64-NEXT: rdrandl %eax +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: retq %rand1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind %v1 = extractvalue { i32, i32 } %rand1, 0 %rand2 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind %v2 = extractvalue { i32, i32 } %rand2, 0 %add = add i32 %v2, %v1 ret i32 %add -; CHECK-LABEL: CSE: -; CHECK: rdrandl -; CHECK: rdrandl } ; Check that MachineLICM doesn't hoist rdrand instructions. 
define void @loop(i32* %p, i32 %n) nounwind { +; X86-LABEL: loop: +; X86: # BB#0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: je .LBB3_3 +; X86-NEXT: # BB#1: # %while.body.preheader +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: .LBB3_2: # %while.body +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: rdrandl %edx +; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: leal 4(%ecx), %ecx +; X86-NEXT: decl %eax +; X86-NEXT: jne .LBB3_2 +; X86-NEXT: .LBB3_3: # %while.end +; X86-NEXT: retl +; +; X64-LABEL: loop: +; X64: # BB#0: # %entry +; X64-NEXT: testl %esi, %esi +; X64-NEXT: je .LBB3_2 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB3_1: # %while.body +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: rdrandl %eax +; X64-NEXT: movl %eax, (%rdi) +; X64-NEXT: leaq 4(%rdi), %rdi +; X64-NEXT: decl %esi +; X64-NEXT: jne .LBB3_1 +; X64-NEXT: .LBB3_2: # %while.end +; X64-NEXT: retq entry: %tobool1 = icmp eq i32 %n, 0 br i1 %tobool1, label %while.end, label %while.body @@ -78,8 +129,4 @@ while.body: ; preds = %entry, %while.body while.end: ; preds = %while.body, %entry ret void -; CHECK-LABEL: loop: -; CHECK-NOT: rdrandl -; CHECK: This Inner Loop Header: Depth=1 -; CHECK: rdrandl } diff --git a/test/CodeGen/X86/rdseed-x86_64.ll b/test/CodeGen/X86/rdseed-x86_64.ll new file mode 100644 index 0000000000000..b0d9748dd6ae1 --- /dev/null +++ b/test/CodeGen/X86/rdseed-x86_64.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s + +declare {i64, i32} @llvm.x86.rdseed.64() + +define i32 @_rdseed64_step(i64* %random_val) { +; CHECK-LABEL: _rdseed64_step: +; CHECK: # BB#0: +; CHECK-NEXT: rdseedq %rcx +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovael %ecx, %eax +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: retq + %call = call {i64, i32} @llvm.x86.rdseed.64() + %randval = extractvalue {i64, i32} %call, 0 + store i64 %randval, i64* %random_val + %isvalid = extractvalue {i64, i32} %call, 1 + ret i32 %isvalid +} diff --git a/test/CodeGen/X86/rdseed.ll b/test/CodeGen/X86/rdseed.ll index c219b4ad27ece..b22e3e7ceac07 100644 --- a/test/CodeGen/X86/rdseed.ll +++ b/test/CodeGen/X86/rdseed.ll @@ -1,48 +1,56 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdseed | FileCheck %s --check-prefix=X64 declare {i16, i32} @llvm.x86.rdseed.16() declare {i32, i32} @llvm.x86.rdseed.32() -declare {i64, i32} @llvm.x86.rdseed.64() define i32 @_rdseed16_step(i16* %random_val) { +; X86-LABEL: _rdseed16_step: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: rdseedw %ax +; X86-NEXT: movzwl %ax, %edx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: movw %dx, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: _rdseed16_step: +; X64: # BB#0: +; X64-NEXT: rdseedw %ax +; X64-NEXT: movzwl %ax, %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: movw %cx, (%rdi) +; X64-NEXT: retq %call = call {i16, i32} @llvm.x86.rdseed.16() %randval = extractvalue {i16, i32} %call, 0 store i16 %randval, i16* %random_val %isvalid = extractvalue {i16, i32} 
%call, 1 ret i32 %isvalid -; CHECK-LABEL: _rdseed16_step: -; CHECK: rdseedw %ax -; CHECK: movzwl %ax, %ecx -; CHECK: movl $1, %eax -; CHECK: cmovael %ecx, %eax -; CHECK: movw %cx, (%r[[A0:di|cx]]) -; CHECK: ret } define i32 @_rdseed32_step(i32* %random_val) { +; X86-LABEL: _rdseed32_step: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: rdseedl %edx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: _rdseed32_step: +; X64: # BB#0: +; X64-NEXT: rdseedl %ecx +; X64-NEXT: movl $1, %eax +; X64-NEXT: cmovael %ecx, %eax +; X64-NEXT: movl %ecx, (%rdi) +; X64-NEXT: retq %call = call {i32, i32} @llvm.x86.rdseed.32() %randval = extractvalue {i32, i32} %call, 0 store i32 %randval, i32* %random_val %isvalid = extractvalue {i32, i32} %call, 1 ret i32 %isvalid -; CHECK-LABEL: _rdseed32_step: -; CHECK: rdseedl %e[[T0:[a-z]+]] -; CHECK: movl $1, %eax -; CHECK: cmovael %e[[T0]], %eax -; CHECK: movl %e[[T0]], (%r[[A0]]) -; CHECK: ret -} - -define i32 @_rdseed64_step(i64* %random_val) { - %call = call {i64, i32} @llvm.x86.rdseed.64() - %randval = extractvalue {i64, i32} %call, 0 - store i64 %randval, i64* %random_val - %isvalid = extractvalue {i64, i32} %call, 1 - ret i32 %isvalid -; CHECK-LABEL: _rdseed64_step: -; CHECK: rdseedq %r[[T1:[a-z]+]] -; CHECK: movl $1, %eax -; CHECK: cmovael %e[[T1]], %eax -; CHECK: movq %r[[T1]], (%r[[A0]]) -; CHECK: ret } diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll index 16e261bf3c5e0..02a968c6f27d1 100644 --- a/test/CodeGen/X86/recip-fastmath.ll +++ b/test/CodeGen/X86/recip-fastmath.ll @@ -45,9 +45,9 @@ define float @f32_no_estimate(float %x) #0 { ; ; SANDY-LABEL: f32_no_estimate: ; SANDY: # BB#0: -; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] -; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50] +; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [14:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: f32_no_estimate: ; HASWELL: # BB#0: @@ -113,11 +113,11 @@ define float @f32_one_step(float %x) #1 { ; SANDY: # BB#0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50] ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: f32_one_step: ; HASWELL: # BB#0: @@ -207,7 +207,7 @@ define float @f32_two_step(float %x) #2 { ; SANDY: # BB#0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00] -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50] ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00] ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] @@ -215,7 +215,7 @@ define float @f32_two_step(float %x) #2 { ; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: 
vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: f32_two_step: ; HASWELL: # BB#0: @@ -284,25 +284,25 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 { ; ; SANDY-LABEL: v4f32_no_estimate: ; SANDY: # BB#0: -; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] -; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] +; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [14:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v4f32_no_estimate: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50] ; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00] ; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v4f32_no_estimate: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 # sched: [4:0.50] +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50] ; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00] ; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x @@ -350,18 +350,18 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; ; SANDY-LABEL: v4f32_one_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00] ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v4f32_one_step: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 ; HASWELL-NEXT: retq # sched: [1:1.00] @@ -370,7 +370,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -379,7 +379,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; KNL-LABEL: v4f32_one_step: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = 
[1,1,1,1] sched: [4:0.50] ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 ; KNL-NEXT: retq # sched: [1:1.00] @@ -453,9 +453,9 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; ; SANDY-LABEL: v4f32_two_step: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00] ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00] ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] @@ -463,12 +463,12 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v4f32_two_step: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 @@ -480,7 +480,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -493,7 +493,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; KNL-LABEL: v4f32_two_step: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 @@ -504,7 +504,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; SKX-LABEL: v4f32_two_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 @@ -541,30 +541,30 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 { ; BTVER2-LABEL: v8f32_no_estimate: ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] -; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_no_estimate: ; SANDY: # BB#0: -; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] -; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [12:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] +; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [29:3.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v8f32_no_estimate: ; HASWELL: # BB#0: -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00] ; HASWELL-NEXT: retq # sched: [1:1.00] ; ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate: ; HASWELL-NO-FMA: # BB#0: -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] ; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: retq ; ; AVX512-LABEL: v8f32_no_estimate: ; AVX512: # BB#0: -; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 # sched: [5:1.00] +; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00] ; AVX512-NEXT: retq # sched: [1:1.00] %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x @@ -610,27 +610,27 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; BTVER2-LABEL: v8f32_one_step: ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_one_step: ; SANDY: # BB#0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v8f32_one_step: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 ; HASWELL-NEXT: retq # sched: [1:1.00] @@ -639,7 +639,7 
@@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -648,7 +648,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; KNL-LABEL: v8f32_one_step: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 ; KNL-NEXT: retq # sched: [1:1.00] @@ -722,22 +722,22 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; BTVER2-LABEL: v8f32_two_step: ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:1.00] -; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:1.00] -; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00] +; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00] +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00] +; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_two_step: ; SANDY: # BB#0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] @@ -745,12 +745,12 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v8f32_two_step: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; 
HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 @@ -762,7 +762,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -775,7 +775,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; KNL-LABEL: v8f32_two_step: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 @@ -786,7 +786,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; SKX-LABEL: v8f32_two_step: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll index 440a6f0bef13a..c82eab84757f4 100644 --- a/test/CodeGen/X86/recip-fastmath2.ll +++ b/test/CodeGen/X86/recip-fastmath2.ll @@ -39,8 +39,8 @@ define float @f32_no_step_2(float %x) #3 { ; SANDY-LABEL: f32_no_step_2: ; SANDY: # BB#0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: f32_no_step_2: ; HASWELL: # BB#0: @@ -110,12 +110,12 @@ define float @f32_one_step_2(float %x) #1 { ; SANDY: # BB#0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50] ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: f32_one_step_2: ; HASWELL: # BB#0: @@ -198,13 +198,13 @@ define float @f32_one_step_2_divs(float %x) #1 { ; SANDY: # BB#0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50] ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00] +; 
SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00] ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: f32_one_step_2_divs: ; HASWELL: # BB#0: @@ -305,7 +305,7 @@ define float @f32_two_step_2(float %x) #2 { ; SANDY: # BB#0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00] -; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50] ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00] ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] @@ -313,8 +313,8 @@ define float @f32_two_step_2(float %x) #2 { ; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: f32_two_step_2: ; HASWELL: # BB#0: @@ -403,19 +403,19 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { ; ; SANDY-LABEL: v4f32_one_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00] ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v4f32_one_step2: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] @@ -425,7 +425,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] @@ -435,7 +435,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 { ; KNL-LABEL: v4f32_one_step2: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; 
KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 ; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50] @@ -501,20 +501,20 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; ; SANDY-LABEL: v4f32_one_step_2_divs: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00] ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00] ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v4f32_one_step_2_divs: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] @@ -525,7 +525,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] @@ -536,7 +536,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; KNL-LABEL: v4f32_one_step_2_divs: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 ; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50] @@ -619,9 +619,9 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; ; SANDY-LABEL: v4f32_two_step2: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] +; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [7:3.00] ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00] ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] @@ -629,13 +629,13 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, 
%xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 @@ -648,7 +648,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %xmm3 # sched: [4:0.50] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [4:0.50] ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50] ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] @@ -662,7 +662,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; KNL-LABEL: v4f32_two_step2: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 @@ -674,7 +674,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; SKX-LABEL: v4f32_two_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 # sched: [4:0.50] +; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50] ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 @@ -729,29 +729,29 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; BTVER2-LABEL: v8f32_one_step2: ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_one_step2: ; SANDY: # BB#0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v8f32_one_step2: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] @@ -761,7 +761,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] @@ -771,7 +771,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { ; KNL-LABEL: v8f32_one_step2: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] @@ -835,31 +835,31 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; BTVER2-LABEL: v8f32_one_step_2_divs: ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:1.00] -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:2.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_one_step_2_divs: ; SANDY: # BB#0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} 
ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00] ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v8f32_one_step_2_divs: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] @@ -870,7 +870,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] @@ -881,7 +881,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; KNL-LABEL: v8f32_one_step_2_divs: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00] @@ -964,23 +964,23 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # BB#0: ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] -; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:1.00] -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:1.00] -; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] -; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:1.00] -; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] -; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00] +; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00] +; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00] +; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00] +; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] +; BTVER2-NEXT: vmulps 
{{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_two_step2: ; SANDY: # BB#0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00] ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50] +; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] @@ -988,13 +988,13 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # BB#0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; HASWELL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 @@ -1007,7 +1007,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; HASWELL-NO-FMA: # BB#0: ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*}}(%rip), %ymm3 # sched: [5:1.00] +; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] @@ -1021,7 +1021,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; KNL-LABEL: v8f32_two_step2: ; KNL: # BB#0: ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] -; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 @@ -1033,7 +1033,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; SKX-LABEL: v8f32_two_step2: ; SKX: # BB#0: ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 -; SKX-NEXT: vbroadcastss {{.*}}(%rip), %ymm2 # sched: [5:1.00] +; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00] ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 @@ -1064,13 +1064,13 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 { ; ; BTVER2-LABEL: v8f32_no_step: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_no_step: ; SANDY: # BB#0: ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # 
sched: [1:1.00] ; ; HASWELL-LABEL: v8f32_no_step: ; HASWELL: # BB#0: @@ -1118,15 +1118,15 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { ; ; BTVER2-LABEL: v8f32_no_step2: ; BTVER2: # BB#0: -; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:1.00] -; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:1.00] +; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] +; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: v8f32_no_step2: ; SANDY: # BB#0: ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00] -; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: v8f32_no_step2: ; HASWELL: # BB#0: diff --git a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll index ba8ff1bc1819c..3bb14c4b1cd83 100644 --- a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll +++ b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s +; RUN: llc -lsr-filter-same-scaled-reg=false < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s ; Test case for the recoloring of broken hints. ; It is tricky to have something reasonably small that kicks this optimization, since ; it requires that splitting and spilling occur. diff --git a/test/CodeGen/X86/rotate4.ll b/test/CodeGen/X86/rotate4.ll index 56a7d32850569..c7117be91ab47 100644 --- a/test/CodeGen/X86/rotate4.ll +++ b/test/CodeGen/X86/rotate4.ll @@ -1,17 +1,20 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=generic | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s ; Check that we recognize this idiom for rotation too: ; a << (b & (OpSize-1)) | a >> ((0 - b) & (OpSize-1)) define i32 @rotate_left_32(i32 %a, i32 %b) { ; CHECK-LABEL: rotate_left_32: -; CHECK-NOT: and -; CHECK: roll -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: roll %cl, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq %and = and i32 %b, 31 %shl = shl i32 %a, %and - %0 = sub i32 0, %b - %and3 = and i32 %0, 31 + %t0 = sub i32 0, %b + %and3 = and i32 %t0, 31 %shr = lshr i32 %a, %and3 %or = or i32 %shl, %shr ret i32 %or @@ -19,13 +22,15 @@ entry: define i32 @rotate_right_32(i32 %a, i32 %b) { ; CHECK-LABEL: rotate_right_32: -; CHECK-NOT: and -; CHECK: rorl -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: rorl %cl, %edi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: retq %and = and i32 %b, 31 %shl = lshr i32 %a, %and - %0 = sub i32 0, %b - %and3 = and i32 %0, 31 + %t0 = sub i32 0, %b + %and3 = and i32 %t0, 31 %shr = shl i32 %a, %and3 %or = or i32 %shl, %shr ret i32 %or @@ -33,13 +38,15 @@ entry: define i64 @rotate_left_64(i64 %a, i64 %b) { ; CHECK-LABEL: rotate_left_64: -; CHECK-NOT: and -; CHECK: rolq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: rolq %cl, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq %and = and i64 %b, 63 %shl = shl i64 %a, %and - %0 = sub i64 0, %b - %and3 = and i64 %0, 63 + %t0 = sub i64 0, %b + %and3 = and i64 %t0, 63 %shr = lshr i64 %a, %and3 %or = or i64 %shl, %shr ret i64 %or @@ -47,13 +54,15 @@ entry: define i64 @rotate_right_64(i64 %a, i64 %b) { ; CHECK-LABEL: rotate_right_64: -; CHECK-NOT: and -; CHECK: 
rorq -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: rorq %cl, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq %and = and i64 %b, 63 %shl = lshr i64 %a, %and - %0 = sub i64 0, %b - %and3 = and i64 %0, 63 + %t0 = sub i64 0, %b + %and3 = and i64 %t0, 63 %shr = shl i64 %a, %and3 %or = or i64 %shl, %shr ret i64 %or @@ -63,16 +72,15 @@ entry: define void @rotate_left_m32(i32 *%pa, i32 %b) { ; CHECK-LABEL: rotate_left_m32: -; CHECK-NOT: and -; CHECK: roll -; no store: -; CHECK-NOT: mov -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: roll %cl, (%rdi) +; CHECK-NEXT: retq %a = load i32, i32* %pa, align 16 %and = and i32 %b, 31 %shl = shl i32 %a, %and - %0 = sub i32 0, %b - %and3 = and i32 %0, 31 + %t0 = sub i32 0, %b + %and3 = and i32 %t0, 31 %shr = lshr i32 %a, %and3 %or = or i32 %shl, %shr store i32 %or, i32* %pa, align 32 @@ -81,16 +89,15 @@ entry: define void @rotate_right_m32(i32 *%pa, i32 %b) { ; CHECK-LABEL: rotate_right_m32: -; CHECK-NOT: and -; CHECK: rorl -; no store: -; CHECK-NOT: mov -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: rorl %cl, (%rdi) +; CHECK-NEXT: retq %a = load i32, i32* %pa, align 16 %and = and i32 %b, 31 %shl = lshr i32 %a, %and - %0 = sub i32 0, %b - %and3 = and i32 %0, 31 + %t0 = sub i32 0, %b + %and3 = and i32 %t0, 31 %shr = shl i32 %a, %and3 %or = or i32 %shl, %shr store i32 %or, i32* %pa, align 32 @@ -99,16 +106,15 @@ entry: define void @rotate_left_m64(i64 *%pa, i64 %b) { ; CHECK-LABEL: rotate_left_m64: -; CHECK-NOT: and -; CHECK: rolq -; no store: -; CHECK-NOT: mov -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: rolq %cl, (%rdi) +; CHECK-NEXT: retq %a = load i64, i64* %pa, align 16 %and = and i64 %b, 63 %shl = shl i64 %a, %and - %0 = sub i64 0, %b - %and3 = and i64 %0, 63 + %t0 = sub i64 0, %b + %and3 = and i64 %t0, 63 %shr = lshr i64 %a, %and3 %or = or i64 %shl, %shr store i64 %or, i64* %pa, align 64 @@ -117,18 +123,18 @@ entry: define void @rotate_right_m64(i64 *%pa, i64 %b) { ; CHECK-LABEL: rotate_right_m64: -; CHECK-NOT: and -; CHECK: rorq -; no store: -; CHECK-NOT: mov -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: rorq %cl, (%rdi) +; CHECK-NEXT: retq %a = load i64, i64* %pa, align 16 %and = and i64 %b, 63 %shl = lshr i64 %a, %and - %0 = sub i64 0, %b - %and3 = and i64 %0, 63 + %t0 = sub i64 0, %b + %and3 = and i64 %t0, 63 %shr = shl i64 %a, %and3 %or = or i64 %shl, %shr store i64 %or, i64* %pa, align 64 ret void } + diff --git a/test/CodeGen/X86/sbb.ll b/test/CodeGen/X86/sbb.ll index 414780b2d4e65..b6e8ebf6ed068 100644 --- a/test/CodeGen/X86/sbb.ll +++ b/test/CodeGen/X86/sbb.ll @@ -146,10 +146,8 @@ define i32 @ugt_select_neg1_or_0(i32 %x, i32 %y) nounwind { define i32 @uge_select_0_or_neg1(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: uge_select_0_or_neg1: ; CHECK: # BB#0: -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl %esi, %edi -; CHECK-NEXT: setae %al -; CHECK-NEXT: decl %eax +; CHECK-NEXT: sbbl %eax, %eax ; CHECK-NEXT: retq %cmp = icmp uge i32 %x, %y %ext = zext i1 %cmp to i32 @@ -163,10 +161,8 @@ define i32 @uge_select_0_or_neg1(i32 %x, i32 %y) nounwind { define i32 @ule_select_0_or_neg1(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: ule_select_0_or_neg1: ; CHECK: # BB#0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpl %edi, %esi -; CHECK-NEXT: setbe %al -; CHECK-NEXT: decl %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: sbbl %eax, %eax ; CHECK-NEXT: retq %cmp = icmp ule i32 %y, %x %ext = zext i1 %cmp to i32 @@ -180,10 
+176,8 @@ define i32 @ule_select_0_or_neg1(i32 %x, i32 %y) nounwind { define i32 @uge_select_0_or_neg1_sub(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: uge_select_0_or_neg1_sub: ; CHECK: # BB#0: -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl %esi, %edi -; CHECK-NEXT: setae %al -; CHECK-NEXT: decl %eax +; CHECK-NEXT: sbbl %eax, %eax ; CHECK-NEXT: retq %cmp = icmp uge i32 %x, %y %ext = zext i1 %cmp to i32 @@ -191,6 +185,38 @@ define i32 @uge_select_0_or_neg1_sub(i32 %x, i32 %y) nounwind { ret i32 %sub } +; Check more sub-from-zero patterns. +; (X >u Y) ? -1 : 0 --> cmp, sbb + +define i64 @ugt_select_neg1_or_0_sub(i64 %x, i64 %y) nounwind { +; CHECK-LABEL: ugt_select_neg1_or_0_sub: +; CHECK: # BB#0: +; CHECK-NEXT: cmpq %rdi, %rsi +; CHECK-NEXT: sbbq %rax, %rax +; CHECK-NEXT: retq + %cmp = icmp ugt i64 %x, %y + %zext = zext i1 %cmp to i64 + %sub = sub i64 0, %zext + ret i64 %sub +} + +; Swap the predicate and compare operands: +; (Y <u X) ? -1 : 0 --> cmp, sbb + +define i16 @ult_select_neg1_or_0_sub(i16 %x, i16 %y) nounwind { +; CHECK-LABEL: ult_select_neg1_or_0_sub: +; CHECK: # BB#0: +; CHECK-NEXT: cmpw %di, %si +; CHECK-NEXT: sbbw %ax, %ax +; CHECK-NEXT: retq + %cmp = icmp ult i16 %y, %x + %zext = zext i1 %cmp to i16 + %sub = sub i16 0, %zext + ret i16 %sub +} + + + ; Make sure we're creating nodes with the right value types. This would crash. ; https://bugs.llvm.org/show_bug.cgi?id=33560 diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll index a97e7c299e73d..0eb9bf46ffd15 100644 --- a/test/CodeGen/X86/select_const.ll +++ b/test/CodeGen/X86/select_const.ll @@ -205,6 +205,111 @@ define i32 @select_C_Cplus1_signext(i1 signext %cond) { ret i32 %sel } +; If the constants differ by a small multiplier, use LEA. +; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> LEA C2(Cond * (C1-C2)) + +define i32 @select_lea_2(i1 zeroext %cond) { +; CHECK-LABEL: select_lea_2: +; CHECK: # BB#0: +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %sel = select i1 %cond, i32 -1, i32 1 + ret i32 %sel +} + +define i64 @select_lea_3(i1 zeroext %cond) { +; CHECK-LABEL: select_lea_3: +; CHECK: # BB#0: +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: movl $1, %ecx +; CHECK-NEXT: movq $-2, %rax +; CHECK-NEXT: cmoveq %rcx, %rax +; CHECK-NEXT: retq + %sel = select i1 %cond, i64 -2, i64 1 + ret i64 %sel +} + +define i32 @select_lea_5(i1 zeroext %cond) { +; CHECK-LABEL: select_lea_5: +; CHECK: # BB#0: +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: movl $-2, %ecx +; CHECK-NEXT: movl $3, %eax +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %sel = select i1 %cond, i32 -2, i32 3 + ret i32 %sel +} + +define i64 @select_lea_9(i1 zeroext %cond) { +; CHECK-LABEL: select_lea_9: +; CHECK: # BB#0: +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: movl $2, %ecx +; CHECK-NEXT: movq $-7, %rax +; CHECK-NEXT: cmoveq %rcx, %rax +; CHECK-NEXT: retq + %sel = select i1 %cond, i64 -7, i64 2 + ret i64 %sel +} + + +; If the constants differ by a large power-of-2, that can be a shift of the difference plus the smaller constant. 
+; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 + +define i8 @select_pow2_diff(i1 zeroext %cond) { +; CHECK-LABEL: select_pow2_diff: +; CHECK: # BB#0: +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: movb $19, %al +; CHECK-NEXT: jne .LBB22_2 +; CHECK-NEXT: # BB#1: +; CHECK-NEXT: movb $3, %al +; CHECK-NEXT: .LBB22_2: +; CHECK-NEXT: retq + %sel = select i1 %cond, i8 19, i8 3 + ret i8 %sel +} + +define i16 @select_pow2_diff_invert(i1 zeroext %cond) { +; CHECK-LABEL: select_pow2_diff_invert: +; CHECK: # BB#0: +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: movw $7, %cx +; CHECK-NEXT: movw $71, %ax +; CHECK-NEXT: cmovnew %cx, %ax +; CHECK-NEXT: retq + %sel = select i1 %cond, i16 7, i16 71 + ret i16 %sel +} + +define i32 @select_pow2_diff_neg(i1 zeroext %cond) { +; CHECK-LABEL: select_pow2_diff_neg: +; CHECK: # BB#0: +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: movl $-9, %ecx +; CHECK-NEXT: movl $-25, %eax +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %sel = select i1 %cond, i32 -9, i32 -25 + ret i32 %sel +} + +define i64 @select_pow2_diff_neg_invert(i1 zeroext %cond) { +; CHECK-LABEL: select_pow2_diff_neg_invert: +; CHECK: # BB#0: +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: movl $29, %ecx +; CHECK-NEXT: movq $-99, %rax +; CHECK-NEXT: cmoveq %rcx, %rax +; CHECK-NEXT: retq + %sel = select i1 %cond, i64 -99, i64 29 + ret i64 %sel +} + ; In general, select of 2 constants could be: ; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> add (and (sext Cond), C1-C2), C2 @@ -263,11 +368,11 @@ define <4 x i32> @sel_constants_add_constant_vec(i1 %cond) { ; CHECK-LABEL: sel_constants_add_constant_vec: ; CHECK: # BB#0: ; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne .LBB22_1 +; CHECK-NEXT: jne .LBB30_1 ; CHECK-NEXT: # BB#2: ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [12,13,14,15] ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB22_1: +; CHECK-NEXT: .LBB30_1: ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4294967293,14,4,4] ; CHECK-NEXT: retq %sel = select i1 %cond, <4 x i32> <i32 -4, i32 12, i32 1, i32 0>, <4 x i32> <i32 11, i32 11, i32 11, i32 11> @@ -279,11 +384,11 @@ define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) { ; CHECK-LABEL: sel_constants_fmul_constant_vec: ; CHECK: # BB#0: ; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne .LBB23_1 +; CHECK-NEXT: jne .LBB31_1 ; CHECK-NEXT: # BB#2: ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.188300e+02,3.454000e+01] ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB23_1: +; CHECK-NEXT: .LBB31_1: ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [-2.040000e+01,3.768000e+01] ; CHECK-NEXT: retq %sel = select i1 %cond, <2 x double> <double -4.0, double 12.0>, <2 x double> <double 23.3, double 11.0> diff --git a/test/CodeGen/X86/shift-codegen.ll b/test/CodeGen/X86/shift-codegen.ll index 7d52bdeb9e3ab..295a55d86a00d 100644 --- a/test/CodeGen/X86/shift-codegen.ll +++ b/test/CodeGen/X86/shift-codegen.ll @@ -1,38 +1,36 @@ -; RUN: llc < %s -relocation-model=static -march=x86 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -relocation-model=static -mtriple=i686-unknown-unknown | FileCheck %s ; This should produce two shll instructions, not any LEAs. 
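; (A fixed shift left by 3 could equally be selected as an LEA with scale 8, ; e.g. leal (,%eax,8), %eax, so the checks below pin the plain shll form.)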
target triple = "i686-apple-darwin8" -@Y = weak global i32 0 ; <i32*> [#uses=1] -@X = weak global i32 0 ; <i32*> [#uses=2] - +@Y = weak global i32 0 +@X = weak global i32 0 define void @fn1() { ; CHECK-LABEL: fn1: -; CHECK-NOT: ret -; CHECK-NOT: lea -; CHECK: shll $3 -; CHECK-NOT: lea -; CHECK: ret - - %tmp = load i32, i32* @Y ; <i32> [#uses=1] - %tmp1 = shl i32 %tmp, 3 ; <i32> [#uses=1] - %tmp2 = load i32, i32* @X ; <i32> [#uses=1] - %tmp3 = or i32 %tmp1, %tmp2 ; <i32> [#uses=1] +; CHECK: # BB#0: +; CHECK-NEXT: movl Y, %eax +; CHECK-NEXT: shll $3, %eax +; CHECK-NEXT: orl %eax, X +; CHECK-NEXT: retl + %tmp = load i32, i32* @Y + %tmp1 = shl i32 %tmp, 3 + %tmp2 = load i32, i32* @X + %tmp3 = or i32 %tmp1, %tmp2 store i32 %tmp3, i32* @X ret void } define i32 @fn2(i32 %X, i32 %Y) { ; CHECK-LABEL: fn2: -; CHECK-NOT: ret -; CHECK-NOT: lea -; CHECK: shll $3 -; CHECK-NOT: lea -; CHECK: ret - - %tmp2 = shl i32 %Y, 3 ; <i32> [#uses=1] - %tmp4 = or i32 %tmp2, %X ; <i32> [#uses=1] +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shll $3, %eax +; CHECK-NEXT: orl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: retl + %tmp2 = shl i32 %Y, 3 + %tmp4 = or i32 %tmp2, %X ret i32 %tmp4 } diff --git a/test/CodeGen/X86/shift-folding.ll b/test/CodeGen/X86/shift-folding.ll index 6988787089778..76cf4a41a6cbe 100644 --- a/test/CodeGen/X86/shift-folding.ll +++ b/test/CodeGen/X86/shift-folding.ll @@ -1,12 +1,13 @@ -; RUN: llc < %s -march=x86 -verify-coalescing | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-coalescing | FileCheck %s define i32* @test1(i32* %P, i32 %X) { ; CHECK-LABEL: test1: -; CHECK-NOT: shrl -; CHECK-NOT: shll -; CHECK: ret - -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: andl $-4, %eax +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: retl %Y = lshr i32 %X, 2 %gep.upgrd.1 = zext i32 %Y to i64 %P2 = getelementptr i32, i32* %P, i64 %gep.upgrd.1 @@ -15,11 +16,11 @@ entry: define i32* @test2(i32* %P, i32 %X) { ; CHECK-LABEL: test2: -; CHECK: shll $4 -; CHECK-NOT: shll -; CHECK: ret - -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shll $4, %eax +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: retl %Y = shl i32 %X, 2 %gep.upgrd.2 = zext i32 %Y to i64 %P2 = getelementptr i32, i32* %P, i64 %gep.upgrd.2 @@ -28,11 +29,11 @@ entry: define i32* @test3(i32* %P, i32 %X) { ; CHECK-LABEL: test3: -; CHECK-NOT: shrl -; CHECK-NOT: shll -; CHECK: ret - -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: andl $-4, %eax +; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: retl %Y = ashr i32 %X, 2 %P2 = getelementptr i32, i32* %P, i32 %Y ret i32* %P2 @@ -40,25 +41,27 @@ entry: define fastcc i32 @test4(i32* %d) { ; CHECK-LABEL: test4: -; CHECK-NOT: shrl -; CHECK: ret - -entry: +; CHECK: # BB#0: +; CHECK-NEXT: movzbl 3(%ecx), %eax +; CHECK-NEXT: retl %tmp4 = load i32, i32* %d %tmp512 = lshr i32 %tmp4, 24 ret i32 %tmp512 } -define i64 @test5(i16 %i, i32* %arr) { ; Ensure that we don't fold away shifts which have multiple uses, as they are ; just re-introduced for the second use. 
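; (Concretely, in test5 the shifted value is used both as the scaled index of the ; load and as an operand of the final add, as the addl (%ecx,%eax,4), %eax check ; shows, so folding it into the addressing mode would force it to be recomputed.)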
-; CHECK-LABEL: test5: -; CHECK-NOT: shrl -; CHECK: shrl $11 -; CHECK-NOT: shrl -; CHECK: ret -entry: +define i64 @test5(i16 %i, i32* %arr) { +; CHECK-LABEL: test5: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shrl $11, %eax +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: addl (%ecx,%eax,4), %eax +; CHECK-NEXT: setb %dl +; CHECK-NEXT: retl %i.zext = zext i16 %i to i32 %index = lshr i32 %i.zext, 11 %index.zext = zext i32 %index to i64 diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll index ee8921c41a063..c84869433546b 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -37,24 +37,16 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { ; ; AVX512F-LABEL: shuffle_v32i8_to_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -62,11 +54,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -74,12 +62,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L @@ -166,11 +149,8 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX2-LABEL: shuffle_v16i16_to_v8i16: ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: vpshufb 
%xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vmovdqa %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -178,11 +158,7 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX512F-LABEL: shuffle_v16i16_to_v8i16: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -190,42 +166,22 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L @@ -293,48 +249,50 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { } define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { -; AVX-LABEL: shuffle_v8i32_to_v4i32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX-NEXT: vmovaps %xmm0, (%rsi) -; 
AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i32_to_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vmovaps %xmm0, (%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_to_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v8i32_to_v4i32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512F-NEXT: vmovaps %xmm0, (%rsi) +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_to_v4i32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512VL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v8i32_to_v4i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi) +; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %L @@ -413,11 +371,9 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX2-LABEL: shuffle_v32i8_to_v8i8: ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovq %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -425,11 +381,8 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v32i8_to_v8i8: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: 
vmovq %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -437,39 +390,23 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L @@ -542,26 +479,19 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-LABEL: shuffle_v16i16_to_v4i16: ; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vmovq %xmm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_to_v4i16: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX2-NEXT: vmovq %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -569,12 +499,8 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX512F-LABEL: shuffle_v16i16_to_v4i16: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -582,31 +508,23 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L @@ -676,24 +594,19 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX1-LABEL: shuffle_v32i8_to_v4i8: ; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-NEXT: vmovd %xmm0, (%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v32i8_to_v4i8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb 
%xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vmovd %xmm0, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -701,11 +614,8 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v32i8_to_v4i8: ; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -713,30 +623,23 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L @@ -802,3 +705,73 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { store <4 x i8> %strided.vec, <4 x i8>* %S ret void } + +; In this case not all elements are collected from the same source vector, so +; the resulting BUILD_VECTOR should not be combined to a truncate. 
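+; Concretely: the even-index mask <0,2,4,...,30> on <32 x i8> %v selects the low +; byte of each i16 lane (little-endian), which is exactly a truncate of <16 x i16> +; to <16 x i8>; the insertelement below then overwrites element 0 with byte 0 of +; %w, so the result is no longer drawn from a single truncated source.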
+define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { +; AVX1-LABEL: negative: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: negative: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: negative: +; AVX512F: # BB#0: +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: negative: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: negative: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: negative: +; AVX512BWVL: # BB#0: +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] +; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001 +; AVX512BWVL-NEXT: kmovd %eax, %k1 +; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX512BWVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 
28, i32 30> + %w0 = extractelement <32 x i8> %w, i32 0 + %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0 + ret <16 x i8> %merged +} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll index a3ba589758009..69155b5cc565a 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -11,49 +11,37 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v32i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovsxwd 32(%rdi), %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v32i8: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vpmovsxwd 32(%rdi), %zmm1 +; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper 
; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -106,54 +94,12 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { } define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { -; AVX512F-LABEL: shuffle_v32i16_to_v16i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i16_to_v16i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i16_to_v16i16: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16: -; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30] -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i16_to_v16i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> store <16 x i16> %strided.vec, <16 x i16>* %S @@ -177,11 +123,8 @@ define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { ; AVX512-LABEL: shuffle_v16i32_to_v8i32: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vshufps {{.*#+}} ymm0 
= ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %L @@ -205,127 +148,12 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { } define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx -; AVX512BW-NEXT: vmovd %ecx, %xmm1 -; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: 
vextracti32x4 $3, %zmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: -; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx -; AVX512BWVL-NEXT: vmovd %ecx, %xmm1 -; AVX512BWVL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $4, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $12, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $4, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 -; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> store <16 x i8> %strided.vec, <16 x i8>* %S @@ -347,99 +175,12 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { } define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { -; AVX512F-LABEL: shuffle_v32i16_to_v8i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 
$1, %ymm0, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 -; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax -; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512BW-NEXT: vmovd %xmm2, %eax -; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax -; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax -; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: -; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; AVX512BWVL-NEXT: vmovd %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512BWVL-NEXT: vmovd %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrw $4, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512BWVL-NEXT: vmovd %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrw 
$4, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 -; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i16_to_v8i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> store <8 x i16> %strided.vec, <8 x i16>* %S @@ -461,95 +202,12 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { } define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512BW-NEXT: vpextrb $8, %xmm1, %r8d -; AVX512BW-NEXT: vpextrb $0, %xmm1, %r9d -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512BW-NEXT: vpextrb $8, %xmm1, %r10d -; AVX512BW-NEXT: vpextrb $0, %xmm1, %r11d -; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm1 -; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax -; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx -; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx -; AVX512BW-NEXT: vpextrb $0, %xmm0, %edi -; AVX512BW-NEXT: vmovd %edi, %xmm0 -; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; AVX512BW-NEXT: vpinsrb $4, %r11d, %xmm0, %xmm0 -; AVX512BW-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0 -; AVX512BW-NEXT: vpinsrb $6, %r9d, 
%xmm0, %xmm0 -; AVX512BW-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: -; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx -; AVX512BWVL-NEXT: vmovd %ecx, %xmm1 -; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 -; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $8, %xmm2, %eax -; AVX512BWVL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56> store <8 x i8> %strided.vec, <8 x i8>* %S diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll index 5436cf248bd51..d0b8972cee503 100644 --- a/test/CodeGen/X86/sink-blockfreq.ll +++ b/test/CodeGen/X86/sink-blockfreq.ll @@ -2,7 +2,7 @@ ; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI ; Test that by changing BlockFrequencyInfo we change the order in which -; machine-sink looks for sucessor blocks. By not using BFI, both G and B +; machine-sink looks for successor blocks. By not using BFI, both G and B ; have the same loop depth and no instructions is sinked - B is selected but ; can't be used as to avoid breaking a non profitable critical edge. By using ; BFI, "mul" is sinked into the less frequent block G. diff --git a/test/CodeGen/X86/sink-gep-before-mem-inst.ll b/test/CodeGen/X86/sink-gep-before-mem-inst.ll new file mode 100644 index 0000000000000..b9c94adda9936 --- /dev/null +++ b/test/CodeGen/X86/sink-gep-before-mem-inst.ll @@ -0,0 +1,25 @@ +; RUN: opt < %s -S -codegenprepare -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define i64 @test.after(i8 addrspace(1)* readonly align 8) { +; CHECK-LABEL: test.after +; CHECK: sunkaddr +entry: + %.0 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8 + %addr = bitcast i8 addrspace(1)* %.0 to i32 addrspace(1)* + br label %header + +header: + %addr.in.loop = phi i32 addrspace(1)* [ %addr, %entry ], [ %addr.after, %header ] + %local_2_ = phi i64 [ 0, %entry ], [ %.9, %header ] + %.7 = load i32, i32 addrspace(1)* %addr.in.loop, align 8 + fence acquire + %.1 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 8 + %addr.after = bitcast i8 addrspace(1)* %.1 to i32 addrspace(1)* + %.8 = sext i32 %.7 to i64 + %.9 = add i64 %local_2_, %.8 + %not. 
= icmp sgt i64 %.9, 999 + br i1 %not., label %exit, label %header + +exit: + ret i64 %.9 +} diff --git a/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll b/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll new file mode 100644 index 0000000000000..0461ee809efb5 --- /dev/null +++ b/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll @@ -0,0 +1,55 @@ +; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking | FileCheck %s +; +; D31946 +; Check that we don't end up with the "LLVM ERROR: Cannot select" error. +; Additionally ensure that the output code actually puts fp128 values in SSE registers. + +declare fp128 @llvm.fabs.f128(fp128) +declare fp128 @llvm.copysign.f128(fp128, fp128) + +define fp128 @TestSelect(fp128 %a, fp128 %b) { + %cmp = fcmp ogt fp128 %a, %b + %sub = fsub fp128 %a, %b + %res = select i1 %cmp, fp128 %sub, fp128 0xL00000000000000000000000000000000 + ret fp128 %res +; CHECK-LABEL: TestSelect: +; CHECK movaps 16(%rsp), %xmm1 +; CHECK-NEXT callq __subtf3 +; CHECK-NEXT testl %ebx, %ebx +; CHECK-NEXT jg .LBB0_2 +; CHECK-NEXT # BB#1: +; CHECK-NEXT movaps .LCPI0_0(%rip), %xmm0 +; CHECK-NEXT .LBB0_2: +; CHECK-NEXT addq $32, %rsp +; CHECK-NEXT popq %rbx +; CHECK-NEXT retq +} + +define fp128 @TestFabs(fp128 %a) { + %res = call fp128 @llvm.fabs.f128(fp128 %a) + ret fp128 %res +; CHECK-LABEL: TestFabs: +; CHECK andps .LCPI1_0(%rip), %xmm0 +; CHECK-NEXT retq +} + +define fp128 @TestCopysign(fp128 %a, fp128 %b) { + %res = call fp128 @llvm.copysign.f128(fp128 %a, fp128 %b) + ret fp128 %res +; CHECK-LABEL: TestCopysign: +; CHECK andps .LCPI2_1(%rip), %xmm0 +; CHECK-NEXT orps %xmm1, %xmm0 +; CHECK-NEXT retq +} + +define fp128 @TestFneg(fp128 %a) { + %mul = fmul fp128 %a, %a + %res = fsub fp128 0xL00000000000000008000000000000000, %mul + ret fp128 %res +; CHECK-LABEL: TestFneg: +; CHECK movaps %xmm0, %xmm1 +; CHECK-NEXT callq __multf3 +; CHECK-NEXT xorps .LCPI3_0(%rip), %xmm0 +; CHECK-NEXT popq %rax +; CHECK-NEXT retq +} diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll index 52e6b61aedfe8..c41acd43b3ab6 100644 --- a/test/CodeGen/X86/sse-schedule.ll +++ b/test/CodeGen/X86/sse-schedule.ll @@ -31,8 +31,8 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; SANDY-LABEL: test_addps: ; SANDY: # BB#0: ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_addps: ; HASWELL: # BB#0: @@ -73,8 +73,8 @@ define float @test_addss(float %a0, float %a1, float *%a2) { ; SANDY-LABEL: test_addss: ; SANDY: # BB#0: ; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_addss: ; HASWELL: # BB#0: @@ -122,9 +122,9 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; ; SANDY-LABEL: test_andps: ; SANDY: # BB#0: -; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_andps: ; HASWELL:
# BB#0: @@ -176,9 +176,9 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; ; SANDY-LABEL: test_andnotps: ; SANDY: # BB#0: -; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_andnotps: ; HASWELL: # BB#0: @@ -228,9 +228,9 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; SANDY-LABEL: test_cmpps: ; SANDY: # BB#0: ; SANDY-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00] -; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cmpps: ; HASWELL: # BB#0: @@ -277,7 +277,7 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) { ; SANDY: # BB#0: ; SANDY-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cmpss: ; HASWELL: # BB#0: @@ -347,16 +347,16 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; SANDY-LABEL: test_comiss: ; SANDY: # BB#0: ; SANDY-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: setnp %al # sched: [1:0.33] -; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: setnp %al # sched: [1:1.00] +; SANDY-NEXT: sete %cl # sched: [1:1.00] ; SANDY-NEXT: andb %al, %cl # sched: [1:0.33] ; SANDY-NEXT: vcomiss (%rdi), %xmm0 # sched: [7:1.00] -; SANDY-NEXT: setnp %al # sched: [1:0.33] -; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: setnp %al # sched: [1:1.00] +; SANDY-NEXT: sete %dl # sched: [1:1.00] ; SANDY-NEXT: andb %al, %dl # sched: [1:0.33] ; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] ; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_comiss: ; HASWELL: # BB#0: @@ -417,10 +417,10 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) { ; ; SANDY-LABEL: test_cvtsi2ss: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00] -; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:2.00] +; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [10:1.00] ; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtsi2ss: ; HASWELL: # BB#0: @@ -466,10 +466,10 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) { ; ; SANDY-LABEL: test_cvtsi2ssq: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00] -; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00] +; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [10:1.00] ; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtsi2ssq: ; HASWELL: # BB#0: @@ -515,10 +515,10 @@ define i32 @test_cvtss2si(float %a0, 
float *%a1) { ; ; SANDY-LABEL: test_cvtss2si: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00] -; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [7:1.00] +; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00] +; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [10:1.00] ; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtss2si: ; HASWELL: # BB#0: @@ -567,10 +567,10 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) { ; ; SANDY-LABEL: test_cvtss2siq: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00] -; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00] +; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [10:1.00] ; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtss2siq: ; HASWELL: # BB#0: @@ -619,10 +619,10 @@ define i32 @test_cvttss2si(float %a0, float *%a1) { ; ; SANDY-LABEL: test_cvttss2si: ; SANDY: # BB#0: -; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00] -; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [7:1.00] +; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00] +; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [10:1.00] ; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvttss2si: ; HASWELL: # BB#0: @@ -668,10 +668,10 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) { ; ; SANDY-LABEL: test_cvttss2siq: ; SANDY: # BB#0: -; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00] -; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00] +; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [10:1.00] ; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvttss2siq: ; HASWELL: # BB#0: @@ -714,9 +714,9 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; ; SANDY-LABEL: test_divps: ; SANDY: # BB#0: -; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00] -; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [14:1.00] +; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [20:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_divps: ; HASWELL: # BB#0: @@ -756,9 +756,9 @@ define float @test_divss(float %a0, float %a1, float *%a2) { ; ; SANDY-LABEL: test_divss: ; SANDY: # BB#0: -; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00] -; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [14:1.00] +; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [20:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_divss: ; HASWELL: # BB#0: @@ -799,8 +799,8 @@ define void @test_ldmxcsr(i32 %a0) { ; SANDY-LABEL: test_ldmxcsr: ; SANDY: # BB#0: ; SANDY-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00] -; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [4:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_ldmxcsr: ; HASWELL: # BB#0: @@ -843,8 +843,8 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x 
float> %a1, <4 x float> *%a ; SANDY-LABEL: test_maxps: ; SANDY: # BB#0: ; SANDY-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maxps: ; HASWELL: # BB#0: @@ -886,8 +886,8 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; SANDY-LABEL: test_maxss: ; SANDY: # BB#0: ; SANDY-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maxss: ; HASWELL: # BB#0: @@ -929,8 +929,8 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; SANDY-LABEL: test_minps: ; SANDY: # BB#0: ; SANDY-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_minps: ; HASWELL: # BB#0: @@ -972,8 +972,8 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; SANDY-LABEL: test_minss: ; SANDY: # BB#0: ; SANDY-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_minss: ; HASWELL: # BB#0: @@ -1017,10 +1017,10 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) { ; ; SANDY-LABEL: test_movaps: ; SANDY: # BB#0: -; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [6:0.50] ; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movaps: ; HASWELL: # BB#0: @@ -1068,7 +1068,7 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) { ; SANDY-LABEL: test_movhlps: ; SANDY: # BB#0: ; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movhlps: ; HASWELL: # BB#0: @@ -1111,10 +1111,10 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { ; ; SANDY-LABEL: test_movhps: ; SANDY: # BB#0: -; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movhps: ; HASWELL: # BB#0: @@ -1164,7 +1164,7 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) { ; SANDY: # BB#0: ; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movlhps: ; HASWELL: # BB#0: @@ -1206,10 +1206,10 @@ define void 
@test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) { ; ; SANDY-LABEL: test_movlps: ; SANDY: # BB#0: -; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movlps: ; HASWELL: # BB#0: @@ -1254,8 +1254,8 @@ define i32 @test_movmskps(<4 x float> %a0) { ; ; SANDY-LABEL: test_movmskps: ; SANDY: # BB#0: -; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [2:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movmskps: ; HASWELL: # BB#0: @@ -1295,8 +1295,8 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) { ; ; SANDY-LABEL: test_movntps: ; SANDY: # BB#0: -; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movntps: ; HASWELL: # BB#0: @@ -1335,10 +1335,10 @@ define void @test_movss_mem(float* %a0, float* %a1) { ; ; SANDY-LABEL: test_movss_mem: ; SANDY: # BB#0: -; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] ; SANDY-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movss_mem: ; HASWELL: # BB#0: @@ -1383,8 +1383,8 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) { ; ; SANDY-LABEL: test_movss_reg: ; SANDY: # BB#0: -; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movss_reg: ; HASWELL: # BB#0: @@ -1423,10 +1423,10 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) { ; ; SANDY-LABEL: test_movups: ; SANDY: # BB#0: -; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50] ; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movups: ; HASWELL: # BB#0: @@ -1469,8 +1469,8 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; SANDY-LABEL: test_mulps: ; SANDY: # BB#0: ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mulps: ; HASWELL: # BB#0: @@ -1511,8 +1511,8 @@ define float @test_mulss(float %a0, float %a1, float *%a2) { ; SANDY-LABEL: test_mulss: ; SANDY: # BB#0: ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: 
[5:1.00] +; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mulss: ; HASWELL: # BB#0: @@ -1560,9 +1560,9 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2 ; ; SANDY-LABEL: test_orps: ; SANDY: # BB#0: -; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_orps: ; HASWELL: # BB#0: @@ -1609,8 +1609,8 @@ define void @test_prefetchnta(i8* %a0) { ; ; SANDY-LABEL: test_prefetchnta: ; SANDY: # BB#0: -; SANDY-NEXT: prefetchnta (%rdi) # sched: [4:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: prefetchnta (%rdi) # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_prefetchnta: ; HASWELL: # BB#0: @@ -1652,10 +1652,10 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) { ; ; SANDY-LABEL: test_rcpps: ; SANDY: # BB#0: -; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [7:3.00] +; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [11:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_rcpps: ; HASWELL: # BB#0: @@ -1708,10 +1708,10 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) { ; SANDY-LABEL: test_rcpss: ; SANDY: # BB#0: ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50] ; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_rcpss: ; HASWELL: # BB#0: @@ -1765,9 +1765,9 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) { ; SANDY-LABEL: test_rsqrtps: ; SANDY: # BB#0: ; SANDY-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [11:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_rsqrtps: ; HASWELL: # BB#0: @@ -1819,11 +1819,11 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) { ; ; SANDY-LABEL: test_rsqrtss: ; SANDY: # BB#0: -; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] -; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [9:1.00] +; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50] +; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_rsqrtss: ; HASWELL: # BB#0: @@ -1875,7 +1875,7 @@ define void @test_sfence() { ; SANDY-LABEL: test_sfence: ; SANDY: # BB#0: ; SANDY-NEXT: sfence # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: 
[1:1.00] ; ; HASWELL-LABEL: test_sfence: ; HASWELL: # BB#0: @@ -1917,8 +1917,8 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; SANDY-LABEL: test_shufps: ; SANDY: # BB#0: ; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00] -; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_shufps: ; HASWELL: # BB#0: @@ -1962,10 +1962,10 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) { ; ; SANDY-LABEL: test_sqrtps: ; SANDY: # BB#0: -; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00] -; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00] +; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [20:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_sqrtps: ; HASWELL: # BB#0: @@ -2017,11 +2017,11 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) { ; ; SANDY-LABEL: test_sqrtss: ; SANDY: # BB#0: -; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00] -; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50] -; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00] +; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50] +; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [114:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_sqrtss: ; HASWELL: # BB#0: @@ -2067,9 +2067,9 @@ define i32 @test_stmxcsr() { ; ; SANDY-LABEL: test_stmxcsr: ; SANDY: # BB#0: -; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00] -; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00] +; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_stmxcsr: ; HASWELL: # BB#0: @@ -2112,8 +2112,8 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; SANDY-LABEL: test_subps: ; SANDY: # BB#0: ; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_subps: ; HASWELL: # BB#0: @@ -2154,8 +2154,8 @@ define float @test_subss(float %a0, float %a1, float *%a2) { ; SANDY-LABEL: test_subss: ; SANDY: # BB#0: ; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_subss: ; HASWELL: # BB#0: @@ -2220,16 +2220,16 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; SANDY-LABEL: test_ucomiss: ; SANDY: # BB#0: ; SANDY-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: setnp %al # sched: [1:0.33] -; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: setnp %al # sched: [1:1.00] +; SANDY-NEXT: sete %cl # sched: [1:1.00] ; SANDY-NEXT: andb %al, %cl # 
sched: [1:0.33] ; SANDY-NEXT: vucomiss (%rdi), %xmm0 # sched: [7:1.00] -; SANDY-NEXT: setnp %al # sched: [1:0.33] -; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: setnp %al # sched: [1:1.00] +; SANDY-NEXT: sete %dl # sched: [1:1.00] ; SANDY-NEXT: andb %al, %dl # sched: [1:0.33] ; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] ; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_ucomiss: ; HASWELL: # BB#0: @@ -2292,8 +2292,8 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; SANDY-LABEL: test_unpckhps: ; SANDY: # BB#0: ; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_unpckhps: ; HASWELL: # BB#0: @@ -2338,8 +2338,8 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; SANDY-LABEL: test_unpcklps: ; SANDY: # BB#0: ; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_unpcklps: ; HASWELL: # BB#0: @@ -2387,9 +2387,9 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a ; ; SANDY-LABEL: test_xorps: ; SANDY: # BB#0: -; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_xorps: ; HASWELL: # BB#0: diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll index 14c155c8c6c09..3c36b21381390 100644 --- a/test/CodeGen/X86/sse2-schedule.ll +++ b/test/CodeGen/X86/sse2-schedule.ll @@ -31,8 +31,8 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; SANDY-LABEL: test_addpd: ; SANDY: # BB#0: ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_addpd: ; HASWELL: # BB#0: @@ -73,8 +73,8 @@ define double @test_addsd(double %a0, double %a1, double *%a2) { ; SANDY-LABEL: test_addsd: ; SANDY: # BB#0: ; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_addsd: ; HASWELL: # BB#0: @@ -117,10 +117,10 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; ; SANDY-LABEL: test_andpd: ; SANDY: # BB#0: -; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vandpd (%rdi), 
%xmm0, %xmm0 # sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_andpd: ; HASWELL: # BB#0: @@ -170,10 +170,10 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; ; SANDY-LABEL: test_andnotpd: ; SANDY: # BB#0: -; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_andnotpd: ; HASWELL: # BB#0: @@ -226,9 +226,9 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; SANDY-LABEL: test_cmppd: ; SANDY: # BB#0: ; SANDY-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] -; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cmppd: ; HASWELL: # BB#0: @@ -275,7 +275,7 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) { ; SANDY: # BB#0: ; SANDY-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cmpsd: ; HASWELL: # BB#0: @@ -345,16 +345,16 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; SANDY-LABEL: test_comisd: ; SANDY: # BB#0: ; SANDY-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: setnp %al # sched: [1:0.33] -; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: setnp %al # sched: [1:1.00] +; SANDY-NEXT: sete %cl # sched: [1:1.00] ; SANDY-NEXT: andb %al, %cl # sched: [1:0.33] ; SANDY-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00] -; SANDY-NEXT: setnp %al # sched: [1:0.33] -; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: setnp %al # sched: [1:1.00] +; SANDY-NEXT: sete %dl # sched: [1:1.00] ; SANDY-NEXT: andb %al, %dl # sched: [1:0.33] ; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] ; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_comisd: ; HASWELL: # BB#0: @@ -416,9 +416,9 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { ; SANDY-LABEL: test_cvtdq2pd: ; SANDY: # BB#0: ; SANDY-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] -; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [10:1.00] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtdq2pd: ; HASWELL: # BB#0: @@ -467,10 +467,10 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) { ; ; SANDY-LABEL: test_cvtdq2ps: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00] -; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] 
-; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtdq2ps: ; HASWELL: # BB#0: @@ -517,10 +517,10 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) { ; ; SANDY-LABEL: test_cvtpd2dq: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [10:1.00] ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtpd2dq: ; HASWELL: # BB#0: @@ -568,10 +568,10 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) { ; ; SANDY-LABEL: test_cvtpd2ps: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [10:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtpd2ps: ; HASWELL: # BB#0: @@ -620,9 +620,9 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) { ; SANDY-LABEL: test_cvtps2dq: ; SANDY: # BB#0: ; SANDY-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00] ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtps2dq: ; HASWELL: # BB#0: @@ -670,10 +670,10 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) { ; ; SANDY-LABEL: test_cvtps2pd: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00] ; SANDY-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtps2pd: ; HASWELL: # BB#0: @@ -724,7 +724,7 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) { ; SANDY-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00] ; SANDY-NEXT: vcvtsd2si (%rdi), %eax # sched: [7:1.00] ; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtsd2si: ; HASWELL: # BB#0: @@ -773,10 +773,10 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) { ; ; SANDY-LABEL: test_cvtsd2siq: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00] -; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00] +; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [10:1.00] ; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtsd2siq: ; HASWELL: # BB#0: @@ -830,10 +830,10 @@ define float @test_cvtsd2ss(double %a0, double *%a1) { ; SANDY-LABEL: test_cvtsd2ss: ; SANDY: # BB#0: ; SANDY-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50] ; SANDY-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00] ; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # 
sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtsd2ss: ; HASWELL: # BB#0: @@ -882,9 +882,9 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) { ; SANDY-LABEL: test_cvtsi2sd: ; SANDY: # BB#0: ; SANDY-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00] -; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00] ; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtsi2sd: ; HASWELL: # BB#0: @@ -931,9 +931,9 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) { ; SANDY-LABEL: test_cvtsi2sdq: ; SANDY: # BB#0: ; SANDY-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00] -; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00] ; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtsi2sdq: ; HASWELL: # BB#0: @@ -985,11 +985,11 @@ define double @test_cvtss2sd(float %a0, float *%a1) { ; ; SANDY-LABEL: test_cvtss2sd: ; SANDY: # BB#0: -; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] -; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50] +; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvtss2sd: ; HASWELL: # BB#0: @@ -1038,10 +1038,10 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) { ; ; SANDY-LABEL: test_cvttpd2dq: ; SANDY: # BB#0: -; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [10:1.00] ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvttpd2dq: ; HASWELL: # BB#0: @@ -1091,9 +1091,9 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) { ; SANDY-LABEL: test_cvttps2dq: ; SANDY: # BB#0: ; SANDY-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00] ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvttps2dq: ; HASWELL: # BB#0: @@ -1139,10 +1139,10 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) { ; ; SANDY-LABEL: test_cvttsd2si: ; SANDY: # BB#0: -; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] +; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00] ; SANDY-NEXT: vcvttsd2si (%rdi), %eax # sched: [7:1.00] ; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvttsd2si: ; HASWELL: # BB#0: @@ -1188,10 +1188,10 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) { ; ; SANDY-LABEL: test_cvttsd2siq: ; SANDY: # BB#0: -; SANDY-NEXT: 
vcvttsd2si %xmm0, %rcx # sched: [3:1.00] -; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00] +; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [10:1.00] ; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_cvttsd2siq: ; HASWELL: # BB#0: @@ -1234,9 +1234,9 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; ; SANDY-LABEL: test_divpd: ; SANDY: # BB#0: -; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] -; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [22:1.00] +; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [28:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_divpd: ; HASWELL: # BB#0: @@ -1276,9 +1276,9 @@ define double @test_divsd(double %a0, double %a1, double *%a2) { ; ; SANDY-LABEL: test_divsd: ; SANDY: # BB#0: -; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] -; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [22:1.00] +; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [28:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_divsd: ; HASWELL: # BB#0: @@ -1322,7 +1322,7 @@ define void @test_lfence() { ; SANDY-LABEL: test_lfence: ; SANDY: # BB#0: ; SANDY-NEXT: lfence # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_lfence: ; HASWELL: # BB#0: @@ -1363,7 +1363,7 @@ define void @test_mfence() { ; SANDY-LABEL: test_mfence: ; SANDY: # BB#0: ; SANDY-NEXT: mfence # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mfence: ; HASWELL: # BB#0: @@ -1402,7 +1402,7 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) { ; SANDY-LABEL: test_maskmovdqu: ; SANDY: # BB#0: ; SANDY-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maskmovdqu: ; HASWELL: # BB#0: @@ -1440,8 +1440,8 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; SANDY-LABEL: test_maxpd: ; SANDY: # BB#0: ; SANDY-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maxpd: ; HASWELL: # BB#0: @@ -1483,8 +1483,8 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; SANDY-LABEL: test_maxsd: ; SANDY: # BB#0: ; SANDY-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_maxsd: ; HASWELL: # BB#0: @@ -1526,8 +1526,8 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; SANDY-LABEL: test_minpd: ; SANDY: # BB#0: ; SANDY-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; 
SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_minpd: ; HASWELL: # BB#0: @@ -1569,8 +1569,8 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; SANDY-LABEL: test_minsd: ; SANDY: # BB#0: ; SANDY-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_minsd: ; HASWELL: # BB#0: @@ -1614,10 +1614,10 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) { ; ; SANDY-LABEL: test_movapd: ; SANDY: # BB#0: -; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [6:0.50] ; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movapd: ; HASWELL: # BB#0: @@ -1662,10 +1662,10 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) { ; ; SANDY-LABEL: test_movdqa: ; SANDY: # BB#0: -; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50] ; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movdqa: ; HASWELL: # BB#0: @@ -1710,10 +1710,10 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) { ; ; SANDY-LABEL: test_movdqu: ; SANDY: # BB#0: -; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [6:0.50] ; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movdqu: ; HASWELL: # BB#0: @@ -1768,12 +1768,12 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) { ; SANDY-LABEL: test_movd: ; SANDY: # BB#0: ; SANDY-NEXT: vmovd %edi, %xmm1 # sched: [1:0.33] -; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50] ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] ; SANDY-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vmovd %xmm0, %eax # sched: [1:0.33] -; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovd %xmm0, %eax # sched: [2:1.00] +; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movd: ; HASWELL: # BB#0: @@ -1838,13 +1838,13 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) { ; ; SANDY-LABEL: test_movd_64: ; SANDY: # BB#0: -; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.33] -; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00] +; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50] ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50] ; SANDY-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vmovq %xmm0, %rax # sched: [1:0.33] -; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] 
+; SANDY-NEXT: vmovq %xmm0, %rax # sched: [2:1.00] +; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movd_64: ; HASWELL: # BB#0: @@ -1900,10 +1900,10 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { ; ; SANDY-LABEL: test_movhpd: ; SANDY: # BB#0: -; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movhpd: ; HASWELL: # BB#0: @@ -1951,10 +1951,10 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { ; ; SANDY-LABEL: test_movlpd: ; SANDY: # BB#0: -; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movlpd: ; HASWELL: # BB#0: @@ -1998,8 +1998,8 @@ define i32 @test_movmskpd(<2 x double> %a0) { ; ; SANDY-LABEL: test_movmskpd: ; SANDY: # BB#0: -; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [2:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movmskpd: ; HASWELL: # BB#0: @@ -2039,8 +2039,8 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) { ; SANDY-LABEL: test_movntdqa: ; SANDY: # BB#0: ; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movntdqa: ; HASWELL: # BB#0: @@ -2080,8 +2080,8 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) { ; SANDY-LABEL: test_movntpd: ; SANDY: # BB#0: ; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movntpd: ; HASWELL: # BB#0: @@ -2123,10 +2123,10 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) { ; ; SANDY-LABEL: test_movq_mem: ; SANDY: # BB#0: -; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50] ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movq_mem: ; HASWELL: # BB#0: @@ -2174,7 +2174,7 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) { ; SANDY: # BB#0: ; SANDY-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33] ; SANDY-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movq_reg: ; HASWELL: # BB#0: @@ -2216,10 +2216,10 @@ define void @test_movsd_mem(double* 
%a0, double* %a1) { ; ; SANDY-LABEL: test_movsd_mem: ; SANDY: # BB#0: -; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50] ; SANDY-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movsd_mem: ; HASWELL: # BB#0: @@ -2266,7 +2266,7 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) { ; SANDY-LABEL: test_movsd_reg: ; SANDY: # BB#0: ; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movsd_reg: ; HASWELL: # BB#0: @@ -2305,10 +2305,10 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) { ; ; SANDY-LABEL: test_movupd: ; SANDY: # BB#0: -; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [6:0.50] ; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movupd: ; HASWELL: # BB#0: @@ -2351,8 +2351,8 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; SANDY-LABEL: test_mulpd: ; SANDY: # BB#0: ; SANDY-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mulpd: ; HASWELL: # BB#0: @@ -2393,8 +2393,8 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) { ; SANDY-LABEL: test_mulsd: ; SANDY: # BB#0: ; SANDY-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mulsd: ; HASWELL: # BB#0: @@ -2437,10 +2437,10 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; ; SANDY-LABEL: test_orpd: ; SANDY: # BB#0: -; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_orpd: ; HASWELL: # BB#0: @@ -2496,8 +2496,8 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; SANDY-LABEL: test_packssdw: ; SANDY: # BB#0: ; SANDY-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_packssdw: ; HASWELL: # BB#0: @@ -2548,8 +2548,8 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_packsswb: ; SANDY: # BB#0: ; SANDY-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: 
vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_packsswb: ; HASWELL: # BB#0: @@ -2600,8 +2600,8 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_packuswb: ; SANDY: # BB#0: ; SANDY-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_packuswb: ; HASWELL: # BB#0: @@ -2648,8 +2648,8 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_paddb: ; SANDY: # BB#0: ; SANDY-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddb: ; HASWELL: # BB#0: @@ -2694,8 +2694,8 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; SANDY-LABEL: test_paddd: ; SANDY: # BB#0: ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddd: ; HASWELL: # BB#0: @@ -2736,8 +2736,8 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; SANDY-LABEL: test_paddq: ; SANDY: # BB#0: ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddq: ; HASWELL: # BB#0: @@ -2781,9 +2781,9 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; ; SANDY-LABEL: test_paddsb: ; SANDY: # BB#0: -; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddsb: ; HASWELL: # BB#0: @@ -2828,9 +2828,9 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ; SANDY-LABEL: test_paddsw: ; SANDY: # BB#0: -; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddsw: ; HASWELL: # BB#0: @@ -2876,8 +2876,8 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_paddusb: ; SANDY: # BB#0: ; SANDY-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddusb: ; HASWELL: 
# BB#0: @@ -2923,8 +2923,8 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_paddusw: ; SANDY: # BB#0: ; SANDY-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddusw: ; HASWELL: # BB#0: @@ -2969,9 +2969,9 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ; SANDY-LABEL: test_paddw: ; SANDY: # BB#0: -; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_paddw: ; HASWELL: # BB#0: @@ -3015,9 +3015,9 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; SANDY-LABEL: test_pand: ; SANDY: # BB#0: ; SANDY-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pand: ; HASWELL: # BB#0: @@ -3070,9 +3070,9 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; SANDY-LABEL: test_pandn: ; SANDY: # BB#0: ; SANDY-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50] ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pandn: ; HASWELL: # BB#0: @@ -3122,8 +3122,8 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_pavgb: ; SANDY: # BB#0: ; SANDY-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pavgb: ; HASWELL: # BB#0: @@ -3169,8 +3169,8 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_pavgw: ; SANDY: # BB#0: ; SANDY-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pavgw: ; HASWELL: # BB#0: @@ -3217,9 +3217,9 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_pcmpeqb: ; SANDY: # BB#0: ; SANDY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpeqb: ; HASWELL: # BB#0: @@ -3269,9 +3269,9 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; SANDY-LABEL: test_pcmpeqd: ; SANDY: # 
BB#0: ; SANDY-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpeqd: ; HASWELL: # BB#0: @@ -3321,9 +3321,9 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_pcmpeqw: ; SANDY: # BB#0: ; SANDY-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpeqw: ; HASWELL: # BB#0: @@ -3374,9 +3374,9 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_pcmpgtb: ; SANDY: # BB#0: ; SANDY-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpgtb: ; HASWELL: # BB#0: @@ -3427,9 +3427,9 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; SANDY-LABEL: test_pcmpgtd: ; SANDY: # BB#0: ; SANDY-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpgtd: ; HASWELL: # BB#0: @@ -3480,9 +3480,9 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_pcmpgtw: ; SANDY: # BB#0: ; SANDY-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] -; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpgtw: ; HASWELL: # BB#0: @@ -3526,9 +3526,9 @@ define i16 @test_pextrw(<8 x i16> %a0) { ; ; SANDY-LABEL: test_pextrw: ; SANDY: # BB#0: -; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50] +; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [3:1.00] ; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pextrw: ; HASWELL: # BB#0: @@ -3570,9 +3570,9 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) { ; ; SANDY-LABEL: test_pinsrw: ; SANDY: # BB#0: -; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pinsrw: ; HASWELL: # BB#0: @@ -3620,9 +3620,9 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ; SANDY-LABEL: test_pmaddwd: ; SANDY: # BB#0: -; SANDY-NEXT: vpmaddwd 
%xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaddwd: ; HASWELL: # BB#0: @@ -3669,8 +3669,8 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_pmaxsw: ; SANDY: # BB#0: ; SANDY-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaxsw: ; HASWELL: # BB#0: @@ -3716,8 +3716,8 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_pmaxub: ; SANDY: # BB#0: ; SANDY-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmaxub: ; HASWELL: # BB#0: @@ -3763,8 +3763,8 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_pminsw: ; SANDY: # BB#0: ; SANDY-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pminsw: ; HASWELL: # BB#0: @@ -3810,8 +3810,8 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_pminub: ; SANDY: # BB#0: ; SANDY-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pminub: ; HASWELL: # BB#0: @@ -3851,8 +3851,8 @@ define i32 @test_pmovmskb(<16 x i8> %a0) { ; ; SANDY-LABEL: test_pmovmskb: ; SANDY: # BB#0: -; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmovmskb: ; HASWELL: # BB#0: @@ -3891,7 +3891,7 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY: # BB#0: ; SANDY-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmulhuw: ; HASWELL: # BB#0: @@ -3932,9 +3932,9 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ; SANDY-LABEL: test_pmulhw: ; SANDY: # BB#0: -; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmulhw: ; HASWELL: # BB#0: @@ -3975,9 +3975,9 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ; SANDY-LABEL: test_pmullw: ; SANDY: # BB#0: -; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: 
[3:1.00] ; SANDY-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmullw: ; HASWELL: # BB#0: @@ -4027,7 +4027,7 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; SANDY: # BB#0: ; SANDY-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] ; SANDY-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmuludq: ; HASWELL: # BB#0: @@ -4073,9 +4073,9 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; SANDY-LABEL: test_por: ; SANDY: # BB#0: ; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_por: ; HASWELL: # BB#0: @@ -4126,9 +4126,9 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; ; SANDY-LABEL: test_psadbw: ; SANDY: # BB#0: -; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psadbw: ; HASWELL: # BB#0: @@ -4176,9 +4176,9 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) { ; SANDY-LABEL: test_pshufd: ; SANDY: # BB#0: ; SANDY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] -; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:0.50] +; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50] ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pshufd: ; HASWELL: # BB#0: @@ -4226,10 +4226,10 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) { ; ; SANDY-LABEL: test_pshufhw: ; SANDY: # BB#0: -; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50] -; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:0.50] -; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00] +; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50] +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pshufhw: ; HASWELL: # BB#0: @@ -4278,9 +4278,9 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) { ; SANDY-LABEL: test_pshuflw: ; SANDY: # BB#0: ; SANDY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50] -; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:0.50] -; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50] +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pshuflw: ; HASWELL: # BB#0: @@ -4326,10 +4326,10 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; ; SANDY-LABEL: test_pslld: ; SANDY: # BB#0: -; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; 
SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pslld: ; HASWELL: # BB#0: @@ -4378,7 +4378,7 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) { ; SANDY-LABEL: test_pslldq: ; SANDY: # BB#0: ; SANDY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pslldq: ; HASWELL: # BB#0: @@ -4417,10 +4417,10 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; ; SANDY-LABEL: test_psllq: ; SANDY: # BB#0: -; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllq: ; HASWELL: # BB#0: @@ -4468,10 +4468,10 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ; SANDY-LABEL: test_psllw: ; SANDY: # BB#0: -; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psllw: ; HASWELL: # BB#0: @@ -4519,10 +4519,10 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; ; SANDY-LABEL: test_psrad: ; SANDY: # BB#0: -; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrad: ; HASWELL: # BB#0: @@ -4570,10 +4570,10 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ; SANDY-LABEL: test_psraw: ; SANDY: # BB#0: -; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psraw: ; HASWELL: # BB#0: @@ -4621,10 +4621,10 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; ; SANDY-LABEL: test_psrld: ; SANDY: # BB#0: -; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; 
SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrld: ; HASWELL: # BB#0: @@ -4673,7 +4673,7 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) { ; SANDY-LABEL: test_psrldq: ; SANDY: # BB#0: ; SANDY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrldq: ; HASWELL: # BB#0: @@ -4712,10 +4712,10 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; ; SANDY-LABEL: test_psrlq: ; SANDY: # BB#0: -; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlq: ; HASWELL: # BB#0: @@ -4763,10 +4763,10 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; ; SANDY-LABEL: test_psrlw: ; SANDY: # BB#0: -; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psrlw: ; HASWELL: # BB#0: @@ -4816,8 +4816,8 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_psubb: ; SANDY: # BB#0: ; SANDY-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubb: ; HASWELL: # BB#0: @@ -4862,8 +4862,8 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; SANDY-LABEL: test_psubd: ; SANDY: # BB#0: ; SANDY-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubd: ; HASWELL: # BB#0: @@ -4904,8 +4904,8 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; SANDY-LABEL: test_psubq: ; SANDY: # BB#0: ; SANDY-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubq: ; HASWELL: # BB#0: @@ -4950,8 +4950,8 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_psubsb: ; SANDY: # BB#0: ; SANDY-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsubsb (%rdi), 
%xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubsb: ; HASWELL: # BB#0: @@ -4997,8 +4997,8 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_psubsw: ; SANDY: # BB#0: ; SANDY-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubsw: ; HASWELL: # BB#0: @@ -5044,8 +5044,8 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_psubusb: ; SANDY: # BB#0: ; SANDY-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubusb: ; HASWELL: # BB#0: @@ -5091,8 +5091,8 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_psubusw: ; SANDY: # BB#0: ; SANDY-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubusw: ; HASWELL: # BB#0: @@ -5138,8 +5138,8 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_psubw: ; SANDY: # BB#0: ; SANDY-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_psubw: ; HASWELL: # BB#0: @@ -5184,8 +5184,8 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; SANDY-LABEL: test_punpckhbw: ; SANDY: # BB#0: ; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] -; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckhbw: ; HASWELL: # BB#0: @@ -5231,9 +5231,9 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; SANDY-LABEL: test_punpckhdq: ; SANDY: # BB#0: ; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] -; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:0.50] +; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50] ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckhdq: ; HASWELL: # BB#0: @@ -5279,10 +5279,10 @@ define <2 x i64> 
@test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; ; SANDY-LABEL: test_punpckhqdq: ; SANDY: # BB#0: -; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] -; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:0.50] +; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50] ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckhqdq: ; HASWELL: # BB#0: @@ -5330,8 +5330,8 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_punpckhwd: ; SANDY: # BB#0: ; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] -; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckhwd: ; HASWELL: # BB#0: @@ -5375,9 +5375,9 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; ; SANDY-LABEL: test_punpcklbw: ; SANDY: # BB#0: -; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] -; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] +; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpcklbw: ; HASWELL: # BB#0: @@ -5423,9 +5423,9 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; SANDY-LABEL: test_punpckldq: ; SANDY: # BB#0: ; SANDY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] -; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:0.50] +; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50] ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpckldq: ; HASWELL: # BB#0: @@ -5472,9 +5472,9 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) ; SANDY-LABEL: test_punpcklqdq: ; SANDY: # BB#0: ; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] -; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:0.50] +; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50] ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpcklqdq: ; HASWELL: # BB#0: @@ -5522,8 +5522,8 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_punpcklwd: ; 
SANDY: # BB#0: ; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] -; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_punpcklwd: ; HASWELL: # BB#0: @@ -5567,9 +5567,9 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; SANDY-LABEL: test_pxor: ; SANDY: # BB#0: ; SANDY-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pxor: ; HASWELL: # BB#0: @@ -5616,9 +5616,9 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; SANDY-LABEL: test_shufpd: ; SANDY: # BB#0: ; SANDY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00] -; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00] +; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_shufpd: ; HASWELL: # BB#0: @@ -5665,10 +5665,10 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) { ; ; SANDY-LABEL: test_sqrtpd: ; SANDY: # BB#0: -; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00] -; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [22:1.00] +; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [28:1.00] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_sqrtpd: ; HASWELL: # BB#0: @@ -5720,11 +5720,11 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) { ; ; SANDY-LABEL: test_sqrtsd: ; SANDY: # BB#0: -; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00] -; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50] -; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00] +; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [6:0.50] +; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_sqrtsd: ; HASWELL: # BB#0: @@ -5771,8 +5771,8 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; SANDY-LABEL: test_subpd: ; SANDY: # BB#0: ; SANDY-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_subpd: ; HASWELL: # BB#0: @@ -5813,8 +5813,8 @@ define double @test_subsd(double %a0, double %a1, double *%a2) { ; SANDY-LABEL: test_subsd: ; SANDY: # BB#0: ; SANDY-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vsubsd 
(%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_subsd: ; HASWELL: # BB#0: @@ -5879,16 +5879,16 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) ; SANDY-LABEL: test_ucomisd: ; SANDY: # BB#0: ; SANDY-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: setnp %al # sched: [1:0.33] -; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: setnp %al # sched: [1:1.00] +; SANDY-NEXT: sete %cl # sched: [1:1.00] ; SANDY-NEXT: andb %al, %cl # sched: [1:0.33] ; SANDY-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00] -; SANDY-NEXT: setnp %al # sched: [1:0.33] -; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: setnp %al # sched: [1:1.00] +; SANDY-NEXT: sete %dl # sched: [1:1.00] ; SANDY-NEXT: andb %al, %dl # sched: [1:0.33] ; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] ; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_ucomisd: ; HASWELL: # BB#0: @@ -5950,9 +5950,9 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; SANDY-LABEL: test_unpckhpd: ; SANDY: # BB#0: ; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] -; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00] +; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_unpckhpd: ; HASWELL: # BB#0: @@ -6005,9 +6005,9 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; SANDY-LABEL: test_unpcklpd: ; SANDY: # BB#0: ; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00] +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_unpcklpd: ; HASWELL: # BB#0: @@ -6053,10 +6053,10 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; ; SANDY-LABEL: test_xorpd: ; SANDY: # BB#0: -; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] -; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] ; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_xorpd: ; HASWELL: # BB#0: diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll index 482b2fcab6425..ef1ddae4532d4 100644 --- a/test/CodeGen/X86/sse3-schedule.ll +++ b/test/CodeGen/X86/sse3-schedule.ll @@ -31,8 +31,8 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; SANDY-LABEL: test_addsubpd: ; SANDY: # BB#0: ; SANDY-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_addsubpd: ; HASWELL: # BB#0: @@ -74,8 +74,8 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; SANDY-LABEL: test_addsubps: ; SANDY: # BB#0: ; SANDY-NEXT: 
vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_addsubps: ; HASWELL: # BB#0: @@ -116,9 +116,9 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; ; SANDY-LABEL: test_haddpd: ; SANDY: # BB#0: -; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00] +; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_haddpd: ; HASWELL: # BB#0: @@ -159,9 +159,9 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; ; SANDY-LABEL: test_haddps: ; SANDY: # BB#0: -; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00] +; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [11:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_haddps: ; HASWELL: # BB#0: @@ -202,9 +202,9 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double ; ; SANDY-LABEL: test_hsubpd: ; SANDY: # BB#0: -; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00] +; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_hsubpd: ; HASWELL: # BB#0: @@ -245,9 +245,9 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *% ; ; SANDY-LABEL: test_hsubps: ; SANDY: # BB#0: -; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00] +; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [11:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_hsubps: ; HASWELL: # BB#0: @@ -287,8 +287,8 @@ define <16 x i8> @test_lddqu(i8* %a0) { ; ; SANDY-LABEL: test_lddqu: ; SANDY: # BB#0: -; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_lddqu: ; HASWELL: # BB#0: @@ -330,9 +330,9 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) { ; SANDY-LABEL: test_movddup: ; SANDY: # BB#0: ; SANDY-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00] -; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50] +; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [6:0.50] ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movddup: ; HASWELL: # BB#0: @@ -380,9 +380,9 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) { ; SANDY-LABEL: test_movshdup: ; SANDY: # BB#0: ; SANDY-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00] -; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50] +; SANDY-NEXT: 
vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movshdup: ; HASWELL: # BB#0: @@ -430,9 +430,9 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) { ; SANDY-LABEL: test_movsldup: ; SANDY: # BB#0: ; SANDY-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00] -; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50] +; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movsldup: ; HASWELL: # BB#0: diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll index 340b9abe88797..1ab1598fcab7c 100644 --- a/test/CodeGen/X86/sse41-schedule.ll +++ b/test/CodeGen/X86/sse41-schedule.ll @@ -25,10 +25,10 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; ; SANDY-LABEL: test_blendpd: ; SANDY: # BB#0: -; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50] +; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00] ; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_blendpd: ; HASWELL: # BB#0: @@ -65,9 +65,9 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> * ; ; SANDY-LABEL: test_blendps: ; SANDY: # BB#0: -; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50] -; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:1.00] +; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_blendps: ; HASWELL: # BB#0: @@ -107,9 +107,9 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub ; ; SANDY-LABEL: test_blendvpd: ; SANDY: # BB#0: -; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] +; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_blendvpd: ; HASWELL: # BB#0: @@ -150,9 +150,9 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> ; ; SANDY-LABEL: test_blendvps: ; SANDY: # BB#0: -; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00] +; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_blendvps: ; HASWELL: # BB#0: @@ -187,9 +187,9 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double> ; ; SANDY-LABEL: test_dppd: ; SANDY: # BB#0: -; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: 
vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_dppd: ; HASWELL: # BB#0: @@ -224,9 +224,9 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2 ; ; SANDY-LABEL: test_dpps: ; SANDY: # BB#0: -; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [12:2.00] ; SANDY-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_dpps: ; HASWELL: # BB#0: @@ -262,8 +262,8 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2) ; SANDY-LABEL: test_insertps: ; SANDY: # BB#0: ; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00] -; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_insertps: ; HASWELL: # BB#0: @@ -296,8 +296,8 @@ define <2 x i64> @test_movntdqa(i8* %a0) { ; ; SANDY-LABEL: test_movntdqa: ; SANDY: # BB#0: -; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [6:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movntdqa: ; HASWELL: # BB#0: @@ -328,9 +328,9 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { ; ; SANDY-LABEL: test_mpsadbw: ; SANDY: # BB#0: -; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:1.00] -; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_mpsadbw: ; HASWELL: # BB#0: @@ -367,8 +367,8 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { ; SANDY-LABEL: test_packusdw: ; SANDY: # BB#0: ; SANDY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_packusdw: ; HASWELL: # BB#0: @@ -411,8 +411,8 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 ; SANDY-LABEL: test_pblendvb: ; SANDY: # BB#0: ; SANDY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pblendvb: ; HASWELL: # BB#0: @@ -448,8 +448,8 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { ; SANDY-LABEL: test_pblendw: ; SANDY: # BB#0: ; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50] -; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpblendw {{.*#+}} xmm0 
= xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pblendw: ; HASWELL: # BB#0: @@ -483,9 +483,9 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { ; ; SANDY-LABEL: test_pcmpeqq: ; SANDY: # BB#0: -; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pcmpeqq: ; HASWELL: # BB#0: @@ -521,9 +521,9 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) { ; ; SANDY-LABEL: test_pextrb: ; SANDY: # BB#0: -; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50] +; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [3:1.00] ; SANDY-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pextrb: ; HASWELL: # BB#0: @@ -558,9 +558,9 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) { ; ; SANDY-LABEL: test_pextrd: ; SANDY: # BB#0: -; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50] +; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [3:1.00] ; SANDY-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pextrd: ; HASWELL: # BB#0: @@ -594,9 +594,9 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) { ; ; SANDY-LABEL: test_pextrq: ; SANDY: # BB#0: -; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50] +; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [3:1.00] ; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pextrq: ; HASWELL: # BB#0: @@ -630,9 +630,9 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) { ; ; SANDY-LABEL: test_pextrw: ; SANDY: # BB#0: -; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50] +; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [3:1.00] ; SANDY-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pextrw: ; HASWELL: # BB#0: @@ -667,9 +667,9 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) { ; ; SANDY-LABEL: test_phminposuw: ; SANDY: # BB#0: -; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00] +; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00] ; SANDY-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_phminposuw: ; HASWELL: # BB#0: @@ -704,9 +704,9 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) { ; ; SANDY-LABEL: test_pinsrb: ; SANDY: # BB#0: -; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50] -; SANDY-NEXT: retq # sched: [5:1.00] +; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:1.00] +; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50] +; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pinsrb: ; HASWELL: # BB#0: @@ -740,9 +740,9 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) { ; ; SANDY-LABEL: test_pinsrd: ; SANDY: # BB#0: -; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50] -; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: 
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pinsrd:
 ; HASWELL: # BB#0:
@@ -778,10 +778,10 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
 ;
 ; SANDY-LABEL: test_pinsrq:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [7:0.50]
 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pinsrq:
 ; HASWELL: # BB#0:
@@ -819,8 +819,8 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SANDY-LABEL: test_pmaxsb:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmaxsb:
 ; HASWELL: # BB#0:
@@ -856,8 +856,8 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SANDY-LABEL: test_pmaxsd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmaxsd:
 ; HASWELL: # BB#0:
@@ -893,8 +893,8 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SANDY-LABEL: test_pmaxud:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmaxud:
 ; HASWELL: # BB#0:
@@ -930,8 +930,8 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SANDY-LABEL: test_pmaxuw:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmaxuw:
 ; HASWELL: # BB#0:
@@ -967,8 +967,8 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SANDY-LABEL: test_pminsb:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pminsb:
 ; HASWELL: # BB#0:
@@ -1004,8 +1004,8 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SANDY-LABEL: test_pminsd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pminsd:
 ; HASWELL: # BB#0:
@@ -1041,8 +1041,8 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SANDY-LABEL: test_pminud:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pminud:
 ; HASWELL: # BB#0:
@@ -1078,8 +1078,8 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SANDY-LABEL: test_pminuw:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pminuw:
 ; HASWELL: # BB#0:
@@ -1118,9 +1118,9 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ; SANDY-LABEL: test_pmovsxbw:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovsxbw:
 ; HASWELL: # BB#0:
@@ -1162,9 +1162,9 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ; SANDY-LABEL: test_pmovsxbd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [7:0.50]
 ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovsxbd:
 ; HASWELL: # BB#0:
@@ -1206,9 +1206,9 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ; SANDY-LABEL: test_pmovsxbq:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [7:0.50]
 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovsxbq:
 ; HASWELL: # BB#0:
@@ -1250,9 +1250,9 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ; SANDY-LABEL: test_pmovsxdq:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [7:0.50]
 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovsxdq:
 ; HASWELL: # BB#0:
@@ -1294,9 +1294,9 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ; SANDY-LABEL: test_pmovsxwd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [7:0.50]
 ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovsxwd:
 ; HASWELL: # BB#0:
@@ -1338,9 +1338,9 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ; SANDY-LABEL: test_pmovsxwq:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [7:0.50]
 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovsxwq:
 ; HASWELL: # BB#0:
@@ -1382,9 +1382,9 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ; SANDY-LABEL: test_pmovzxbw:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:0.50]
-; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
+; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovzxbw:
 ; HASWELL: # BB#0:
@@ -1426,9 +1426,9 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ; SANDY-LABEL: test_pmovzxbd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
 ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovzxbd:
 ; HASWELL: # BB#0:
@@ -1470,9 +1470,9 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ; SANDY-LABEL: test_pmovzxbq:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovzxbq:
 ; HASWELL: # BB#0:
@@ -1514,9 +1514,9 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ; SANDY-LABEL: test_pmovzxdq:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [7:0.50]
 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovzxdq:
 ; HASWELL: # BB#0:
@@ -1558,9 +1558,9 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ; SANDY-LABEL: test_pmovzxwd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
 ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovzxwd:
 ; HASWELL: # BB#0:
@@ -1602,9 +1602,9 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ; SANDY-LABEL: test_pmovzxwq:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmovzxwq:
 ; HASWELL: # BB#0:
@@ -1642,9 +1642,9 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; SANDY-LABEL: test_pmuldq:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmuldq:
 ; HASWELL: # BB#0:
@@ -1680,9 +1680,9 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; SANDY-LABEL: test_pmulld:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmulld:
 ; HASWELL: # BB#0:
@@ -1724,13 +1724,13 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; SANDY-LABEL: test_ptest:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: setb %cl # sched: [1:0.33]
+; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: setb %al # sched: [1:1.00]
+; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: setb %cl # sched: [1:1.00]
 ; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
 ; SANDY-NEXT: movzbl %cl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_ptest:
 ; HASWELL: # BB#0:
@@ -1778,9 +1778,9 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SANDY-LABEL: test_roundpd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [9:1.00]
 ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_roundpd:
 ; HASWELL: # BB#0:
@@ -1822,9 +1822,9 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
 ; SANDY-LABEL: test_roundps:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [9:1.00]
 ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_roundps:
 ; HASWELL: # BB#0:
@@ -1867,9 +1867,9 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; SANDY-LABEL: test_roundsd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_roundsd:
 ; HASWELL: # BB#0:
@@ -1912,9 +1912,9 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
 ; SANDY-LABEL: test_roundss:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_roundss:
 ; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
index afc48bc57ee7d..7ce9ffdbd0ea1 100644
--- a/test/CodeGen/X86/sse42-schedule.ll
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -26,9 +26,9 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
 ; SANDY-LABEL: crc32_32_8:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
 ; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: crc32_32_8:
 ; HASWELL: # BB#0:
@@ -68,9 +68,9 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
 ; SANDY-LABEL: crc32_32_16:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: crc32w %si, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
 ; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: crc32_32_16:
 ; HASWELL: # BB#0:
@@ -112,7 +112,7 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
 ; SANDY-NEXT: crc32l %esi, %edi # sched: [3:1.00]
 ; SANDY-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
 ; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: crc32_32_32:
 ; HASWELL: # BB#0:
@@ -152,9 +152,9 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
 ; SANDY-LABEL: crc32_64_8:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
 ; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: crc32_64_8:
 ; HASWELL: # BB#0:
@@ -196,7 +196,7 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
 ; SANDY-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
 ; SANDY-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
 ; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: crc32_64_64:
 ; HASWELL: # BB#0:
@@ -256,7 +256,7 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SANDY-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
 ; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
 ; SANDY-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpestri:
 ; HASWELL: # BB#0:
@@ -320,7 +320,7 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
 ; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
 ; SANDY-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpestrm:
 ; HASWELL: # BB#0:
@@ -369,12 +369,12 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; SANDY-LABEL: test_pcmpistri:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
 ; SANDY-NEXT: movl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
 ; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
 ; SANDY-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpistri:
 ; HASWELL: # BB#0:
@@ -416,9 +416,9 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; SANDY-LABEL: test_pcmpistrm:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpistrm:
 ; HASWELL: # BB#0:
@@ -453,9 +453,9 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ;
 ; SANDY-LABEL: test_pcmpgtq:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pcmpgtq:
 ; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll
new file mode 100644
index 0000000000000..11afdb7989f15
--- /dev/null
+++ b/test/CodeGen/X86/sse4a-schedule.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=BTVER2
+
+define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
+; GENERIC-LABEL: test_extrq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: extrq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_extrq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: extrq %xmm1, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>)
+
+define <2 x i64> @test_extrqi(<2 x i64> %a0) {
+; GENERIC-LABEL: test_extrqi:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: extrq $2, $3, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_extrqi:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: extrq $2, $3, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8)
+
+define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
+; GENERIC-LABEL: test_insertq:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: insertq %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_insertq:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: insertq %xmm1, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
+; GENERIC-LABEL: test_insertqi:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: insertq $6, $5, %xmm1, %xmm0
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_insertqi:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8)
+
+define void @test_movntsd(i8* %p, <2 x double> %a) {
+; GENERIC-LABEL: test_movntsd:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movntsd %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_movntsd:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a)
+ ret void
+}
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
+
+define void @test_movntss(i8* %p, <4 x float> %a) {
+; GENERIC-LABEL: test_movntss:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movntss %xmm0, (%rdi)
+; GENERIC-NEXT: retq
+;
+; BTVER2-LABEL: test_movntss:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+ tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a)
+ ret void
+}
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll
index 8b7a0c0ec02b6..f24969a30c337 100644
--- a/test/CodeGen/X86/ssse3-schedule.ll
+++ b/test/CodeGen/X86/ssse3-schedule.ll
@@ -35,9 +35,9 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
 ; SANDY-LABEL: test_pabsb:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [7:0.50]
 ; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pabsb:
 ; HASWELL: # BB#0:
@@ -86,9 +86,9 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SANDY-LABEL: test_pabsd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [7:0.50]
 ; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pabsd:
 ; HASWELL: # BB#0:
@@ -136,7 +136,7 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SANDY-LABEL: test_pabsw:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pabsw:
 ; HASWELL: # BB#0:
@@ -182,8 +182,8 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SANDY-LABEL: test_palignr:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
-; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_palignr:
 ; HASWELL: # BB#0:
@@ -223,9 +223,9 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; SANDY-LABEL: test_phaddd:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_phaddd:
 ; HASWELL: # BB#0:
@@ -274,9 +274,9 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; SANDY-LABEL: test_phaddsw:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_phaddsw:
 ; HASWELL: # BB#0:
@@ -317,9 +317,9 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; SANDY-LABEL: test_phaddw:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_phaddw:
 ; HASWELL: # BB#0:
@@ -360,9 +360,9 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; SANDY-LABEL: test_phsubd:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_phsubd:
 ; HASWELL: # BB#0:
@@ -411,9 +411,9 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; SANDY-LABEL: test_phsubsw:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_phsubsw:
 ; HASWELL: # BB#0:
@@ -454,9 +454,9 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; SANDY-LABEL: test_phsubw:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_phsubw:
 ; HASWELL: # BB#0:
@@ -497,9 +497,9 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ;
 ; SANDY-LABEL: test_pmaddubsw:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmaddubsw:
 ; HASWELL: # BB#0:
@@ -538,8 +538,8 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; SANDY-LABEL: test_pmulhrsw:
 ; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmulhrsw:
 ; HASWELL: # BB#0:
@@ -579,8 +579,8 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SANDY-LABEL: test_pshufb:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pshufb:
 ; HASWELL: # BB#0:
@@ -630,8 +630,8 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SANDY-LABEL: test_psignb:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psignb:
 ; HASWELL: # BB#0:
@@ -681,8 +681,8 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SANDY-LABEL: test_psignd:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psignd:
 ; HASWELL: # BB#0:
@@ -732,8 +732,8 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SANDY-LABEL: test_psignw:
 ; SANDY: # BB#0:
 ; SANDY-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_psignw:
 ; HASWELL: # BB#0:
diff --git a/test/CodeGen/X86/swizzle-avx2.ll b/test/CodeGen/X86/swizzle-avx2.ll
index 29dfa6c2dcc17..6ca9126eb09df 100644
--- a/test/CodeGen/X86/swizzle-avx2.ll
+++ b/test/CodeGen/X86/swizzle-avx2.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s
 ; Test that we correctly fold a shuffle that performs a swizzle of another
 ; shuffle node according to the rule
@@ -11,81 +12,77 @@
 ; Check that we produce a single vector permute / shuffle in all cases.
 define <8 x i32> @swizzle_1(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,0,4,5,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
 ret <8 x i32> %2
 }
-; CHECK-LABEL: swizzle_1
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
 define <8 x i32> @swizzle_2(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK-NEXT: retq
 %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
 }
-; CHECK-LABEL: swizzle_2
-; CHECK: vpshufd $78
-; CHECK-NOT: vpermd
-; CHECK-NOT: vpshufd
-; CHECK: ret
-
 define <8 x i32> @swizzle_3(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_3:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK-NEXT: retq
 %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
 ret <8 x i32> %2
 }
-; CHECK-LABEL: swizzle_3
-; CHECK: vpshufd $78
-; CHECK-NOT: vpermd
-; CHECK-NOT: vpshufd
-; CHECK: ret
-
 define <8 x i32> @swizzle_4(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_4:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,2,0,6,5,4,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
 ret <8 x i32> %2
 }
-; CHECK-LABEL: swizzle_4
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
 define <8 x i32> @swizzle_5(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_5:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,0,1,2,7,6,4,5]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
 ret <8 x i32> %2
 }
-; CHECK-LABEL: swizzle_5
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
 define <8 x i32> @swizzle_6(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_6:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,0,2,4,5,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
 ret <8 x i32> %2
 }
-; CHECK-LABEL: swizzle_6
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
 define <8 x i32> @swizzle_7(<8 x i32> %v) {
+; CHECK-LABEL: swizzle_7:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,1,4,5,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
 %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
 ret <8 x i32> %2
 }
-; CHECK-LABEL: swizzle_7
-; CHECK: vpermd
-; CHECK-NOT: vpermd
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
index 80d36d5af4d2c..5ce6bbd4b49ea 100644
--- a/test/CodeGen/X86/tbm_patterns.ll
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -1,253 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+tbm < %s | FileCheck %s
-define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u32:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = lshr i32 %a, 4
- %1 = and i32 %0, 4095
- ret i32 %1
-}
-
-define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind readonly {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = load i32, i32* %a
- %1 = lshr i32 %0, 4
- %2 = and i32 %1, 4095
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u64:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = lshr i64 %a, 4
- %1 = and i64 %0, 4095
- ret i64 %1
-}
-
-define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
-entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
- %0 = load i64, i64* %a
- %1 = lshr i64 %0, 4
- %2 = and i64 %1, 4095
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcfill_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcfill %
- %0 = add i32 %a, 1
- %1 = and i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcfill_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcfill %
- %0 = add i64 %a, 1
- %1 = and i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u32:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = add i32 1, %a
- %1 = xor i32 %0, -1
- %2 = or i32 %1, %a
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u64:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = add i64 1, %a
- %1 = xor i64 %0, -1
- %2 = or i64 %1, %a
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u32_b:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = sub i32 -2, %a
- %1 = or i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blci_u64_b:
- ; CHECK-NOT: mov
- ; CHECK: blci %
- %0 = sub i64 -2, %a
- %1 = or i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcic_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcic %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, 1
- %2 = and i32 %1, %0
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcic_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcic %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, 1
- %2 = and i64 %1, %0
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcmsk_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcmsk %
- %0 = add i32 %a, 1
- %1 = xor i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcmsk_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcmsk %
- %0 = add i64 %a, 1
- %1 = xor i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcs_u32:
- ; CHECK-NOT: mov
- ; CHECK: blcs %
- %0 = add i32 %a, 1
- %1 = or i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blcs_u64:
- ; CHECK-NOT: mov
- ; CHECK: blcs %
- %0 = add i64 %a, 1
- %1 = or i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsfill_u32:
- ; CHECK-NOT: mov
- ; CHECK: blsfill %
- %0 = add i32 %a, -1
- %1 = or i32 %0, %a
- ret i32 %1
-}
-
-define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsfill_u64:
- ; CHECK-NOT: mov
- ; CHECK: blsfill %
- %0 = add i64 %a, -1
- %1 = or i64 %0, %a
- ret i64 %1
-}
-
-define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsic_u32:
- ; CHECK-NOT: mov
- ; CHECK: blsic %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, -1
- %2 = or i32 %0, %1
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_blsic_u64:
- ; CHECK-NOT: mov
- ; CHECK: blsic %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, -1
- %2 = or i64 %0, %1
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_t1mskc_u32:
- ; CHECK-NOT: mov
- ; CHECK: t1mskc %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, 1
- %2 = or i32 %0, %1
- ret i32 %2
-}
-
-define i64 @Ttest_x86_tbm_t1mskc_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_t1mskc_u64:
- ; CHECK-NOT: mov
- ; CHECK: t1mskc %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, 1
- %2 = or i64 %0, %1
- ret i64 %2
-}
-
-define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_tzmsk_u32:
- ; CHECK-NOT: mov
- ; CHECK: tzmsk %
- %0 = xor i32 %a, -1
- %1 = add i32 %a, -1
- %2 = and i32 %0, %1
- ret i32 %2
-}
-
-define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind readnone {
-entry:
- ; CHECK-LABEL: test_x86_tbm_tzmsk_u64:
- ; CHECK-NOT: mov
- ; CHECK: tzmsk %
- %0 = xor i64 %a, -1
- %1 = add i64 %a, -1
- %2 = and i64 %0, %1
- ret i64 %2
+define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = lshr i32 %a, 4
+ %t1 = and i32 %t0, 4095
+ ret i32 %t1
+}
+
+define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, (%rdi), %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = load i32, i32* %a
+ %t1 = lshr i32 %t0, 4
+ %t2 = and i32 %t1, 4095
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = lshr i64 %a, 4
+ %t1 = and i64 %t0, 4095
+ ret i64 %t1
+}
+
+define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextr $3076, (%rdi), %eax # imm = 0xC04
+; CHECK-NEXT: retq
+ %t0 = load i64, i64* %a
+ %t1 = lshr i64 %t0, 4
+ %t2 = and i64 %t1, 4095
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcfill %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = and i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcfill %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = and i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 1, %a
+ %t1 = xor i32 %t0, -1
+ %t2 = or i32 %t1, %a
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 1, %a
+ %t1 = xor i64 %t0, -1
+ %t2 = or i64 %t1, %a
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u32_b:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = sub i32 -2, %a
+ %t1 = or i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u64_b:
+; CHECK: # BB#0:
+; CHECK-NEXT: blci %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = sub i64 -2, %a
+ %t1 = or i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcic %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = and i32 %t1, %t0
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcic %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = and i64 %t1, %t0
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcmsk %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = xor i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcmsk %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = xor i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcs %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = or i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blcs %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = or i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsfill %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, -1
+ %t1 = or i32 %t0, %a
+ ret i32 %t1
+}
+
+define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsfill %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, -1
+ %t1 = or i64 %t0, %a
+ ret i64 %t1
+}
+
+define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsic %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = or i32 %t0, %t1
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsic %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = or i64 %t0, %t1
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_t1mskc_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: t1mskc %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = or i32 %t0, %t1
+ ret i32 %t2
+}
+
+define i64 @Ttest_x86_tbm_t1mskc_u64(i64 %a) nounwind {
+; CHECK-LABEL: Ttest_x86_tbm_t1mskc_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: t1mskc %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = or i64 %t0, %t1
+ ret i64 %t2
+}
+
+define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u32:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzmsk %edi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = and i32 %t0, %t1
+ ret i32 %t2
+}
+
+define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u64:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzmsk %rdi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = and i64 %t0, %t1
+ ret i64 %t2
 }
+
diff --git a/test/CodeGen/X86/vec-copysign.ll b/test/CodeGen/X86/vec-copysign.ll
index d363dbdaef81f..1ebd7ceafced8 100644
--- a/test/CodeGen/X86/vec-copysign.ll
+++ b/test/CodeGen/X86/vec-copysign.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 --check-prefix=CHECK
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=CHECK
-; Assertions have been enhanced from utils/update_test_checks.py to show the constant pool values.
+; Assertions have been enhanced from utils/update_llc_test_checks.py to show the constant pool values.
 ; Use a macosx triple to make sure the format of those constant strings is exact.
 ; CHECK: [[SIGNMASK1:L.+]]:
diff --git a/test/CodeGen/X86/vec_return.ll b/test/CodeGen/X86/vec_return.ll
index f7fcd032cab36..556e32d0c87b9 100644
--- a/test/CodeGen/X86/vec_return.ll
+++ b/test/CodeGen/X86/vec_return.ll
@@ -1,16 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s
 ; Without any typed operations, always use the smaller xorps.
-; CHECK: test
-; CHECK: xorps
 define <2 x double> @test() {
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retl
 ret <2 x double> zeroinitializer
 }
 ; Prefer a constant pool load here.
-; CHECK: test2
-; CHECK-NOT: shuf
-; CHECK: movaps {{.*}}{{CPI|__xmm@}}
 define <4 x i32> @test2() nounwind {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [0,0,1,0]
+; CHECK-NEXT: retl
 ret <4 x i32> < i32 0, i32 0, i32 1, i32 0 >
 }
+
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
index b4a58deff2f8d..731760a4ea55e 100644
--- a/test/CodeGen/X86/vec_shift6.ll
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -153,14 +153,16 @@ define <32 x i16> @test7(<32 x i16> %a) {
 ;
 ; AVX2-LABEL: test7:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: test7:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
 ; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512-NEXT: retq
@@ -183,7 +185,8 @@ define <16 x i32> @test8(<16 x i32> %a) {
 ;
 ; AVX2-LABEL: test8:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll
index 1c352782fca4f..745316effc98b 100644
--- a/test/CodeGen/X86/vec_unsafe-fp-math.ll
+++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll
@@ -1,13 +1,13 @@
-; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown | FileCheck %s
 ; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math.
 ; Subtracting zero is free.
 define <4 x float> @vec_fsub_zero(<4 x float> %x) {
 ; CHECK-LABEL: vec_fsub_zero:
-; CHECK-NOT: subps
-; CHECK-NOT: xorps
-; CHECK: retq
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
 %sub = fsub <4 x float> %x, zeroinitializer
 ret <4 x float> %sub
 }
@@ -15,9 +15,10 @@ define <4 x float> @vec_fsub_zero(<4 x float> %x) {
 ; Negating doesn't require subtraction.
 define <4 x float> @vec_fneg(<4 x float> %x) {
 ; CHECK-LABEL: vec_fneg:
-; CHECK: xorps {{.*}}LCP{{.*}}, %xmm0
-; CHECK-NOT: subps
-; CHECK-NEXT: retq
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
 %sub = fsub <4 x float> zeroinitializer, %x
 ret <4 x float> %sub
 }
+
diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll
index adda108bdc777..d2f33785530b4 100644
--- a/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/test/CodeGen/X86/vector-popcnt-128.ll
@@ -344,20 +344,43 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE41-NEXT: psrlw $8, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: testv8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv8i16:
+; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
+; AVX512VPOPCNTDQ-NEXT: retq
 %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
 ret <8 x i16> %out
 }
@@ -431,17 +454,37 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: testv16i8:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv16i8:
+; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vzeroupper
+; AVX512VPOPCNTDQ-NEXT: retq
 %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
 ret <16 x i8> %out
 }
diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll
index accbad35e9d72..4c5de2fed3852 100644
--- a/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/test/CodeGen/X86/vector-popcnt-256.ll
@@ -155,17 +155,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i16:
 ; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT: retq
 %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
 ret <16 x i16> %out
diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll
index aa50206e7a5ee..a6f4e33428973 100644
--- a/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/test/CodeGen/X86/vector-popcnt-512.ll
@@ -1,11 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
| FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ --check-prefix=AVX512VPOPCNTDQ-NOBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ --check-prefix=AVX512VPOPCNTDQ-BW define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512F-LABEL: testv8i64: -; AVX512F: ## BB#0: +; AVX512F: # BB#0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 @@ -28,7 +29,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: testv8i64: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -42,7 +43,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i64: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in) @@ -51,7 +52,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512F-LABEL: testv16i32: -; AVX512F: ## BB#0: +; AVX512F: # BB#0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 @@ -82,7 +83,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: testv16i32: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -100,7 +101,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv16i32: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; 
AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in) @@ -109,7 +110,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512F-LABEL: testv32i16: -; AVX512F: ## BB#0: +; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -133,7 +134,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: testv32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -147,36 +148,37 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; -; AVX512VPOPCNTDQ-LABEL: testv32i16: -; AVX512VPOPCNTDQ: ## BB#0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: retq +; AVX512VPOPCNTDQ-NOBW-LABEL: testv32i16: +; AVX512VPOPCNTDQ-NOBW: # BB#0: +; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: retq +; +; 
AVX512VPOPCNTDQ-BW-LABEL: testv32i16: +; AVX512VPOPCNTDQ-BW: # BB#0: +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 +; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: vpsllw $8, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: retq %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in) ret <32 x i16> %out } define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512F-LABEL: testv64i8: -; AVX512F: ## BB#0: +; AVX512F: # BB#0: ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -194,7 +196,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: testv64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -205,23 +207,35 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; -; AVX512VPOPCNTDQ-LABEL: testv64i8: -; AVX512VPOPCNTDQ: ## BB#0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: retq +; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8: +; AVX512VPOPCNTDQ-NOBW: # BB#0: +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm3 +; 
AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: retq +; +; AVX512VPOPCNTDQ-BW-LABEL: testv64i8: +; AVX512VPOPCNTDQ-BW: # BB#0: +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 +; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 +; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-BW-NEXT: retq %out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in) ret <64 x i8> %out } diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll new file mode 100644 index 0000000000000..af69a5ac22839 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE42 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,+sse4a| FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; +; Combine tests involving SSE4A target shuffles (EXTRQI,INSERTQI) + +declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) + +define <16 x i8> @combine_extrqi_pshufb_16i8(<16 x i8> %a0) { +; ALL-LABEL: combine_extrqi_pshufb_16i8: +; ALL: # BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1,2],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>) + ret <16 x i8> %2 +} + +define <8 x i16> @combine_extrqi_pshufb_8i16(<8 x i16> %a0) { +; ALL-LABEL: combine_extrqi_pshufb_8i16: +; ALL: # 
BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %1 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = bitcast <8 x i16> %1 to <16 x i8> + %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>) + %4 = bitcast <16 x i8> %3 to <8 x i16> + ret <8 x i16> %4 +} + +define <16 x i8> @combine_insertqi_pshufb_16i8(<16 x i8> %a0, <16 x i8> %a1) { +; SSSE3-LABEL: combine_insertqi_pshufb_16i8: +; SSSE3: # BB#0: +; SSSE3-NEXT: extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE42-LABEL: combine_insertqi_pshufb_16i8: +; SSE42: # BB#0: +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE42-NEXT: retq +; +; AVX-LABEL: combine_insertqi_pshufb_16i8: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: retq + %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>) + ret <16 x i8> %2 +} + +define <8 x i16> @combine_insertqi_pshufb_8i16(<8 x i16> %a0, <8 x i16> %a1) { +; SSSE3-LABEL: combine_insertqi_pshufb_8i16: +; SSSE3: # BB#0: +; SSSE3-NEXT: extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE42-LABEL: combine_insertqi_pshufb_8i16: +; SSE42: # BB#0: +; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE42-NEXT: retq +; +; AVX-LABEL: combine_insertqi_pshufb_8i16: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: retq + %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %2 = bitcast <8 x i16> %1 to <16 x i8> + %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>) + %4 = bitcast <16 x i8> %3 to <8 x i16> + ret <8 x i16> %4 +} + +define <16 x i8> @combine_pshufb_insertqi_pshufb(<16 x i8> %a0, <16 x i8> %a1) { +; ALL-LABEL: combine_pshufb_insertqi_pshufb: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0,1],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>) + %2 = shufflevector <16 x i8> %1, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 17, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 1, i8 2, i8 4, 
i8 3, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>) + ret <16 x i8> %3 +} diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index 546b731260396..02314857c6d7e 100644 --- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -445,6 +445,21 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) { ret <16 x i8> %res1 } +define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) { +; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld: +; SSE: # BB#0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld: +; AVX: # BB#0: +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7] +; AVX-NEXT: retq + %res0 = load <16 x i8>, <16 x i8> *%a0, align 16 + %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>) + ret <16 x i8> %res1 +} + define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) { ; SSE-LABEL: combine_pshufb_as_unary_unpcklbw: ; SSE: # BB#0: diff --git a/test/CodeGen/X86/vector-shuffle-sse4a.ll b/test/CodeGen/X86/vector-shuffle-sse4a.ll index 138c421215f4f..e458bb6fa52ff 100644 --- a/test/CodeGen/X86/vector-shuffle-sse4a.ll +++ b/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=AMD10H ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2 @@ -10,7 +11,6 @@ define <2 x i64> @extrqi_len0_idx0(<2 x i64> %a) { ; ALL-LABEL: extrqi_len0_idx0: ; ALL: # BB#0: -; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,u,u,u,u,u,u,u,u] ; ALL-NEXT: retq %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a, i8 0, i8 0) ret <2 x i64> %1 @@ -36,6 +36,11 @@ define <2 x i64> @extrqi_len32_idx48(<2 x i64> %a) { } define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) { +; AMD10H-LABEL: shuf_0zzzuuuuuuuuuuuu: +; AMD10H: # BB#0: +; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu: ; BTVER1: # BB#0: ; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] @@ -50,12 +55,17 @@ define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) { } define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) { +; AMD10H-LABEL: shuf_0zzzzzzz1zzzzzzz: +; AMD10H: # BB#0: +; AMD10H-NEXT: movdqa %xmm0, %xmm1 +; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz: ; BTVER1: # BB#0: -; BTVER1-NEXT: movdqa %xmm0, %xmm1 -; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; BTVER1-NEXT: extrq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; BTVER1-NEXT: retq ; ; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz: @@ -67,12 +77,17 @@ define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) { } define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) { +; AMD10H-LABEL: shuf_2zzzzzzz3zzzzzzz: +; AMD10H: # BB#0: +; AMD10H-NEXT: movdqa %xmm0, %xmm1 +; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuf_2zzzzzzz3zzzzzzz: ; BTVER1: # BB#0: -; BTVER1-NEXT: movdqa %xmm0, %xmm1 -; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; BTVER1-NEXT: retq ; ; BTVER2-LABEL: shuf_2zzzzzzz3zzzzzzz: @@ -85,6 +100,11 @@ define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) { } define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) { +; AMD10H-LABEL: shuf_01zzuuuuuuuuuuuu: +; AMD10H: # BB#0: +; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu: ; BTVER1: # BB#0: ; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] @@ -99,12 +119,17 @@ define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) { } define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) { +; AMD10H-LABEL: shuf_01zzzzzz23zzzzzz: +; AMD10H: # BB#0: +; AMD10H-NEXT: movdqa %xmm0, %xmm1 +; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz: ; BTVER1: # BB#0: -; BTVER1-NEXT: movdqa %xmm0, %xmm1 -; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero,zero,zero,zero,zero ; BTVER1-NEXT: retq ; ; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz: @@ -143,21 +168,37 @@ define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) { } define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) { -; ALL-LABEL: shuf_012zuuuu: -; ALL: # BB#0: -; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u] -; ALL-NEXT: retq +; AMD10H-LABEL: shuf_012zuuuu: +; AMD10H: # BB#0: +; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: retq +; +; BTVER1-LABEL: shuf_012zuuuu: +; BTVER1: # BB#0: +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_012zuuuu: +; BTVER2: # BB#0: +; 
BTVER2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; BTVER2-NEXT: retq %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %s } define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) { +; AMD10H-LABEL: shuf_0zzz1zzz: +; AMD10H: # BB#0: +; AMD10H-NEXT: movdqa %xmm0, %xmm1 +; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AMD10H-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuf_0zzz1zzz: ; BTVER1: # BB#0: -; BTVER1-NEXT: movdqa %xmm0, %xmm1 -; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] -; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] -; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero,zero,zero,zero,zero ; BTVER1-NEXT: retq ; ; BTVER2-LABEL: shuf_0zzz1zzz: @@ -169,6 +210,12 @@ define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) { } define <4 x i32> @shuf_0z1z(<4 x i32> %a0) { +; AMD10H-LABEL: shuf_0z1z: +; AMD10H: # BB#0: +; AMD10H-NEXT: pxor %xmm1, %xmm1 +; AMD10H-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuf_0z1z: ; BTVER1: # BB#0: ; BTVER1-NEXT: pxor %xmm1, %xmm1 @@ -189,10 +236,20 @@ define <4 x i32> @shuf_0z1z(<4 x i32> %a0) { ; A length of zero is equivalent to a bit length of 64. define <2 x i64> @insertqi_len0_idx0(<2 x i64> %a, <2 x i64> %b) { -; ALL-LABEL: insertqi_len0_idx0: -; ALL: # BB#0: -; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7],xmm0[u,u,u,u,u,u,u,u] -; ALL-NEXT: retq +; AMD10H-LABEL: insertqi_len0_idx0: +; AMD10H: # BB#0: +; AMD10H-NEXT: movaps %xmm1, %xmm0 +; AMD10H-NEXT: retq +; +; BTVER1-LABEL: insertqi_len0_idx0: +; BTVER1: # BB#0: +; BTVER1-NEXT: movaps %xmm1, %xmm0 +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: insertqi_len0_idx0: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovaps %xmm1, %xmm0 +; BTVER2-NEXT: retq %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 0, i8 0) ret <2 x i64> %1 } @@ -303,6 +360,15 @@ define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) { ; Out of range. 
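; (Editor's gloss, hedged -- not part of the original patch.) "Out of range"
; here means the shuffle reads data that EXTRQ/INSERTQ cannot reach: both
; instructions operate only on bits [63:0] of their sources, so an index such
; as element 8 of a <16 x i8> lives above bit 63 and requires generic shuffle
; lowering instead, which is what the checks below verify.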
define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) { +; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu: +; AMD10H: # BB#0: +; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AMD10H-NEXT: andpd {{.*}}(%rip), %xmm0 +; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AMD10H-NEXT: packuswb %xmm0, %xmm0 +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu: ; BTVER1: # BB#0: ; BTVER1-NEXT: psrld $16, %xmm1 @@ -321,6 +387,13 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) { } define <16 x i8> @shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) { +; AMD10H-LABEL: shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AMD10H: # BB#0: +; AMD10H-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; BTVER1: # BB#0: ; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,5,5,4,4,5,5,4,4,5,5,6,6,7,7] @@ -335,6 +408,12 @@ define <16 x i8> @shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8 } define <16 x i8> @shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) { +; AMD10H-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AMD10H: # BB#0: +; AMD10H-NEXT: psrlq $16, %xmm0 +; AMD10H-NEXT: pand {{.*}}(%rip), %xmm0 +; AMD10H-NEXT: retq +; ; BTVER1-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: ; BTVER1: # BB#0: ; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u],zero,xmm0[4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] diff --git a/test/CodeGen/X86/vector-truncate-combine.ll b/test/CodeGen/X86/vector-truncate-combine.ll index 1a6dac8fa6e41..61808b802517d 100644 --- a/test/CodeGen/X86/vector-truncate-combine.ll +++ b/test/CodeGen/X86/vector-truncate-combine.ll @@ -11,14 +11,14 @@ ; preservation of the extend/truncate operations mentioned above (2 extend and ; 3 truncate instructions). ; -; NOTE: This operation could be collapsed in to a single truncate. Once that is done -; this test will have to be adjusted. +; NOTE: This operation is collapsed to a single truncate, so this test no longer covers +; what it originally intended to. 
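; A minimal sketch (editor's addition with hypothetical values, not taken from
; this patch) of the fold the NOTE describes: a chain that steps down through
; an intermediate width is now emitted as the single equivalent truncate.
define <4 x i8> @trunc_collapse_sketch(<4 x i32> %x) {
  %mid = trunc <4 x i32> %x to <4 x i16>  ; intermediate narrowing step...
  %out = trunc <4 x i16> %mid to <4 x i8> ; ...combines with this one into a
  ret <4 x i8> %out                       ; single <4 x i32> -> <4 x i8> trunc
}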
-; CHECK: PUNPCKLBWrr -; CHECK: PUNPCKLWDrr -; CHECK: PACKUSWBrr +; CHECK: MOVLHPSrr +; CHECK: PSHUFHWri ; CHECK: PACKUSWBrr ; CHECK: PACKUSWBrr +; CHECK: MOVPDI2DIrr define void @test(double %vec.coerce) local_unnamed_addr { entry: diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 4b5a00a30d097..820178d2d9927 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -928,17 +928,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-SSE-LABEL: testv8i16: @@ -1095,17 +1088,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-SSE-LABEL: testv8i16u: @@ -1243,14 +1229,10 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, 
%xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-SSE-LABEL: testv16i8: @@ -1384,14 +1366,10 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-SSE-LABEL: testv16i8u: diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index 16192ec61a550..30e5661d54859 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -584,17 +584,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-AVX-LABEL: testv16i16: @@ -722,17 +714,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; X32-AVX-LABEL: testv16i16u: diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 760216d561c4e..3bf677aadf195 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CD-LABEL: testv8i64: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -34,7 +34,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv8i64: -; 
AVX512CDBW: ## BB#0: +; AVX512CDBW: # BB#0: ; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm2 ; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0 @@ -52,7 +52,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv8i64: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 @@ -70,7 +70,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i64: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -84,7 +84,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CD-LABEL: testv8i64u: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -94,7 +94,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv8i64u: -; AVX512CDBW: ## BB#0: +; AVX512CDBW: # BB#0: ; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -104,7 +104,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv8i64u: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 @@ -122,7 +122,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i64u: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -136,7 +136,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CD-LABEL: testv16i32: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0 @@ -172,7 +172,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv16i32: -; AVX512CDBW: ## BB#0: +; AVX512CDBW: # BB#0: ; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm2 ; AVX512CDBW-NEXT: vpandd %zmm2, %zmm0, %zmm0 @@ -194,7 +194,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv16i32: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 @@ -216,7 +216,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv16i32: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 @@ -230,7 +230,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { define <16 x i32> @testv16i32u(<16 x i32> 
%in) nounwind { ; AVX512CD-LABEL: testv16i32u: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0 @@ -240,7 +240,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv16i32u: -; AVX512CDBW: ## BB#0: +; AVX512CDBW: # BB#0: ; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0 @@ -250,7 +250,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv16i32u: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 @@ -272,7 +272,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv16i32u: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 @@ -286,7 +286,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3 ; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -318,7 +318,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv32i16: -; AVX512CDBW: ## BB#0: +; AVX512CDBW: # BB#0: ; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -338,7 +338,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv32i16: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -358,35 +358,21 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv32i16: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0) ret <32 x i16> %out @@ -394,7 +380,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16u: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3 ; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -426,7 +412,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv32i16u: -; AVX512CDBW: ## BB#0: +; AVX512CDBW: # BB#0: ; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -446,7 +432,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv32i16u: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -466,35 +452,21 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv32i16u: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; 
AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1) ret <32 x i16> %out @@ -502,7 +474,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CD-LABEL: testv64i8: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3 ; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -528,7 +500,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv64i8: -; AVX512CDBW: ## BB#0: +; AVX512CDBW: # BB#0: ; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -545,7 +517,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv64i8: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -562,7 +534,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv64i8: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -592,7 +564,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CD-LABEL: testv64i8u: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3 ; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -618,7 +590,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv64i8u: -; AVX512CDBW: ## BB#0: +; AVX512CDBW: # BB#0: ; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -635,7 +607,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv64i8u: -; AVX512BW: ## BB#0: +; AVX512BW: # BB#0: ; 
AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -652,7 +624,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv64i8u: -; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ: # BB#0: ; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll index b5c7f86567a13..182d7cc73c9aa 100644 --- a/test/CodeGen/X86/wide-integer-cmp.ll +++ b/test/CodeGen/X86/wide-integer-cmp.ll @@ -101,8 +101,8 @@ define i32 @test_wide(i128 %a, i128 %b) { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: jge .LBB4_2 ; CHECK-NEXT: # BB#1: # %bb1 ; CHECK-NEXT: movl $1, %eax diff --git a/test/CodeGen/X86/x32-lea-1.ll b/test/CodeGen/X86/x32-lea-1.ll index 2f7d71e2baf1b..afe3581a85bce 100644 --- a/test/CodeGen/X86/x32-lea-1.ll +++ b/test/CodeGen/X86/x32-lea-1.ll @@ -1,10 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O0 | FileCheck %s -; CHECK: leal {{[-0-9]*}}(%r{{s|b}}p), -; CHECK-NOT: leal {{[-0-9]*}}(%e{{s|b}}p), define void @foo(i32** %p) { +; CHECK-LABEL: foo: +; CHECK: # BB#0: +; CHECK-NEXT: leal -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: addl $16, %eax +; CHECK-NEXT: movl %eax, (%edi) +; CHECK-NEXT: retq %a = alloca i32, i32 10 %addr = getelementptr i32, i32* %a, i32 4 store i32* %addr, i32** %p ret void } + diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index 1263605a6dc03..5f85975fdb5ce 100644 --- a/test/CodeGen/X86/x86-interleaved-access.ll +++ b/test/CodeGen/X86/x86-interleaved-access.ll @@ -1,9 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: llc -mtriple=x86_64-pc-linux -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc -mtriple=x86_64-pc-linux -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX3 define <4 x double> @load_factorf64_4(<16 x double>* %ptr) { +; AVX1-LABEL: load_factorf64_4: +; AVX1: # BB#0: +; AVX1-NEXT: vmovupd (%rdi), %ymm0 +; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 +; AVX1-NEXT: vmovupd 64(%rdi), %ymm2 +; AVX1-NEXT: vmovupd 96(%rdi), %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] +; AVX1-NEXT: vhaddpd %ymm5, %ymm4, %ymm4 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; 
; AVX-LABEL: load_factorf64_4: ; AVX: # BB#0: ; AVX-NEXT: vmovupd (%rdi), %ymm0 @@ -32,6 +49,21 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) { } define <4 x double> @load_factorf64_2(<16 x double>* %ptr) { +; AVX1-LABEL: load_factorf64_2: +; AVX1: # BB#0: +; AVX1-NEXT: vmovupd (%rdi), %ymm0 +; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 +; AVX1-NEXT: vmovupd 64(%rdi), %ymm2 +; AVX1-NEXT: vmovupd 96(%rdi), %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: retq +; ; AVX-LABEL: load_factorf64_2: ; AVX: # BB#0: ; AVX-NEXT: vmovupd (%rdi), %ymm0 @@ -54,6 +86,16 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) { } define <4 x double> @load_factorf64_1(<16 x double>* %ptr) { +; AVX1-LABEL: load_factorf64_1: +; AVX1: # BB#0: +; AVX1-NEXT: vmovupd (%rdi), %ymm0 +; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; ; AVX-LABEL: load_factorf64_1: ; AVX: # BB#0: ; AVX-NEXT: vmovupd (%rdi), %ymm0 @@ -98,24 +140,24 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) { ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: load_factori64_4: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu 64(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu 96(%rdi), %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq +; AVX-LABEL: load_factori64_4: +; AVX: # BB#0: +; AVX-NEXT: vmovdqu (%rdi), %ymm0 +; AVX-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX-NEXT: vmovdqu 64(%rdi), %ymm2 +; AVX-NEXT: vmovdqu 96(%rdi), %ymm3 +; AVX-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1] +; AVX-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1] +; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX-NEXT: vpaddq %ymm3, %ymm4, %ymm3 +; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0 +; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX-NEXT: retq %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16 %strided.v0 = shufflevector <16 x i64> %wide.vec, 
<16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> @@ -128,6 +170,23 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) { } define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) { +; AVX1-LABEL: store_factorf64_4: +; AVX1: # BB#0: +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-NEXT: vmovupd %ymm0, 96(%rdi) +; AVX1-NEXT: vmovupd %ymm3, 64(%rdi) +; AVX1-NEXT: vmovupd %ymm4, 32(%rdi) +; AVX1-NEXT: vmovupd %ymm2, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX-LABEL: store_factorf64_4: ; AVX: # BB#0: ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 @@ -169,22 +228,22 @@ define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, < ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: store_factori64_4: -; AVX2: # BB#0: -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) -; AVX2-NEXT: vmovdqu %ymm3, 64(%rdi) -; AVX2-NEXT: vmovdqu %ymm4, 32(%rdi) -; AVX2-NEXT: vmovdqu %ymm2, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: store_factori64_4: +; AVX: # BB#0: +; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 +; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX-NEXT: vmovdqu %ymm0, 96(%rdi) +; AVX-NEXT: vmovdqu %ymm3, 64(%rdi) +; AVX-NEXT: vmovdqu %ymm4, 32(%rdi) +; AVX-NEXT: vmovdqu %ymm2, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15> @@ -252,54 +311,54 @@ define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: interleaved_store_vf32_i8_stride4: -; AVX2: # BB#0: -; AVX2-NEXT: vpunpcklbw {{.*#+}} 
xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7],ymm4[8],ymm6[9],ymm4[10],ymm6[11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi) -; AVX2-NEXT: vmovdqa %ymm4, 64(%rdi) -; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi) -; AVX2-NEXT: vmovdqa %ymm8, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: interleaved_store_vf32_i8_stride4: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] +; AVX-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX-NEXT: vextracti128 
$1, %ymm0, %xmm0 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7],ymm4[8],ymm6[9],ymm4[10],ymm6[11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX-NEXT: vmovdqa %ymm0, 96(%rdi) +; AVX-NEXT: vmovdqa %ymm4, 64(%rdi) +; AVX-NEXT: vmovdqa %ymm5, 32(%rdi) +; AVX-NEXT: vmovdqa %ymm8, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> %v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> %interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, 
i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127> diff --git a/test/CodeGen/X86/zext-shl.ll b/test/CodeGen/X86/zext-shl.ll index ac3ecc85f2d90..7722f46d753a6 100644 --- a/test/CodeGen/X86/zext-shl.ll +++ b/test/CodeGen/X86/zext-shl.ll @@ -1,25 +1,26 @@ -; RUN: llc < %s -march=x86 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s -define i32 @t1(i8 zeroext %x) nounwind readnone ssp { -entry: +define i32 @t1(i8 zeroext %x) nounwind { ; CHECK-LABEL: t1: -; CHECK: shll -; CHECK-NOT: movzwl -; CHECK: ret - %0 = zext i8 %x to i16 - %1 = shl i16 %0, 5 - %2 = zext i16 %1 to i32 - ret i32 %2 +; CHECK: # BB#0: +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shll $5, %eax +; CHECK-NEXT: retl + %t0 = zext i8 %x to i16 + %t1 = shl i16 %t0, 5 + %t2 = zext i16 %t1 to i32 + ret i32 %t2 } -define i32 @t2(i8 zeroext %x) nounwind readnone ssp { -entry: +define i32 @t2(i8 zeroext %x) nounwind { ; CHECK-LABEL: t2: -; CHECK: shrl -; CHECK-NOT: movzwl -; CHECK: ret - %0 = zext i8 %x to i16 - %1 = lshr i16 %0, 3 - %2 = zext i16 %1 to i32 - ret i32 %2 +; CHECK: # BB#0: +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shrl $3, %eax +; CHECK-NEXT: retl + %t0 = zext i8 %x to i16 + %t1 = lshr i16 %t0, 3 + %t2 = zext i16 %t1 to i32 + ret i32 %t2 } diff --git a/test/CodeGen/X86/zext-trunc.ll b/test/CodeGen/X86/zext-trunc.ll index 32afd6b96a8b7..e51a77abc92e1 100644 --- a/test/CodeGen/X86/zext-trunc.ll +++ b/test/CodeGen/X86/zext-trunc.ll @@ -1,11 +1,12 @@ -; RUN: llc < %s -march=x86-64 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s ; rdar://7570931 define i64 @foo(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: foo: -; CHECK: leal -; CHECK-NOT: movl -; CHECK: ret +; CHECK: # BB#0: +; CHECK-NEXT: leal (%rdi,%rsi), %eax +; CHECK-NEXT: retq %c = add i64 %a, %b %d = trunc i64 %c to i32 %e = zext i32 %d to i64 diff --git a/test/DebugInfo/COFF/asm.ll b/test/DebugInfo/COFF/asm.ll index 3d245e9d396d3..a55eec2782a6a 100644 --- a/test/DebugInfo/COFF/asm.ll +++ b/test/DebugInfo/COFF/asm.ll @@ -35,7 +35,7 @@ ; OBJ32: CodeViewDebugInfo [ ; OBJ32: Subsection [ ; OBJ32-NEXT: SubSectionType: Symbols (0xF1) -; OBJ32: ProcStart { +; OBJ32: {{.*}}Proc{{.*}}Sym { ; OBJ32: CodeSize: 0x6 ; OBJ32: DisplayName: f ; OBJ32: LinkageName: _f @@ -94,13 +94,13 @@ ; OBJ64: ] ; OBJ64: Subsection [ ; OBJ64-NEXT: SubSectionType: Symbols (0xF1) -; OBJ64: ProcStart { +; OBJ64: {{.*}}Proc{{.*}}Sym { ; OBJ64: CodeSize: 0xE ; OBJ64: DisplayName: f ; OBJ64: LinkageName: f ; OBJ64: } ; OBJ64-NEXT: ProcEnd { -; OBJ64-NEXT: } +; OBJ64: } ; OBJ64-NEXT: ] ; OBJ64: FunctionLineTable [ ; OBJ64-NEXT: Name: f diff --git a/test/DebugInfo/COFF/cpp-mangling.ll b/test/DebugInfo/COFF/cpp-mangling.ll index 8d1a136ec5fc1..6f8b5a21ffba6 100644 --- a/test/DebugInfo/COFF/cpp-mangling.ll +++ b/test/DebugInfo/COFF/cpp-mangling.ll @@ -12,12 +12,12 @@ ; fn_tmpl<int, foo::bar>(); ; } -; CHECK: ProcStart { +; CHECK: {{.*}}Proc{{.*}}Sym { ; 
CHECK: FunctionType: bar ({{.*}}) ; CHECK: DisplayName: foo::bar{{$}} ; CHECK-NEXT: LinkageName: ?bar@foo@@YAHH@Z -; CHECK: ProcStart { +; CHECK: {{.*}}Proc{{.*}}Sym { ; CHECK: FunctionType: fn_tmpl ({{.*}}) ; CHECK: DisplayName: foo::fn_tmpl<int,&foo::bar> ; CHECK-NEXT: LinkageName: ??$fn_tmpl@H$1?bar@foo@@YAHH@Z@foo@@YAXXZ diff --git a/test/DebugInfo/COFF/fp-stack.ll b/test/DebugInfo/COFF/fp-stack.ll index 4a30a49a3768e..8061e2ee23d1a 100644 --- a/test/DebugInfo/COFF/fp-stack.ll +++ b/test/DebugInfo/COFF/fp-stack.ll @@ -11,7 +11,7 @@ entry: } ; ASM: .cv_def_range Lfunc_begin0 Lfunc_end0, "A\021\200\000\000\000" -; OBJ: DefRangeRegister { +; OBJ: DefRangeRegisterSym { ; OBJ: Register: 128 ; OBJ: MayHaveNoName: 0 ; OBJ: LocalVariableAddrRange { diff --git a/test/DebugInfo/COFF/globals.ll b/test/DebugInfo/COFF/globals.ll index 0d1b9413e3d84..f5d6906e181e6 100644 --- a/test/DebugInfo/COFF/globals.ll +++ b/test/DebugInfo/COFF/globals.ll @@ -81,13 +81,13 @@ ; OBJ: DisplayName: first ; OBJ: LinkageName: ?first@@3HA ; OBJ: } -; OBJ: ThreadLocalDataSym { +; OBJ: GlobalTLS { ; OBJ: DataOffset: ?middle@@3PEBHEB+0x0 ; OBJ: Type: const int* (0x1001) ; OBJ: DisplayName: middle ; OBJ: LinkageName: ?middle@@3PEBHEB ; OBJ: } -; OBJ: DataSym { +; OBJ: GlobalData { ; OBJ: Kind: S_GDATA32 (0x110D) ; OBJ: DataOffset: ?last@@3HA+0x0 ; OBJ: Type: int (0x74) @@ -101,7 +101,7 @@ ; OBJ: Magic: 0x4 ; OBJ: Subsection [ ; OBJ: SubSectionType: Symbols (0xF1) -; OBJ: DataSym { +; OBJ: GlobalData { ; OBJ: DataOffset: ?comdat@?$A@X@@2HB+0x0 ; OBJ: Type: const int (0x1000) ; OBJ: DisplayName: comdat diff --git a/test/DebugInfo/COFF/inlining-files.ll b/test/DebugInfo/COFF/inlining-files.ll index a6f5d281eb097..e3e616b618da5 100644 --- a/test/DebugInfo/COFF/inlining-files.ll +++ b/test/DebugInfo/COFF/inlining-files.ll @@ -18,10 +18,10 @@ ; OBJ: Subsection [ ; OBJ: SubSectionType: Symbols (0xF1) -; OBJ: ProcStart { +; OBJ: {{.*}}Proc{{.*}}Sym { ; OBJ: DisplayName: f ; OBJ: } -; OBJ: InlineSite { +; OBJ: InlineSiteSym { ; OBJ: PtrParent: 0x0 ; OBJ: PtrEnd: 0x0 ; OBJ: Inlinee: file_change (0x1002) diff --git a/test/DebugInfo/COFF/inlining-header.ll b/test/DebugInfo/COFF/inlining-header.ll index 0981825e0d3b9..7e19f14716f0b 100644 --- a/test/DebugInfo/COFF/inlining-header.ll +++ b/test/DebugInfo/COFF/inlining-header.ll @@ -63,7 +63,7 @@ ; OBJ: Subsection [ ; OBJ: SubSectionType: Symbols (0xF1) -; OBJ: ProcStart { +; OBJ: {{.*}}Proc{{.*}}Sym { ; OBJ: Kind: S_GPROC32_ID (0x1147) ; OBJ: FunctionType: main (0x1005) ; OBJ: CodeOffset: _main+0x0 @@ -74,8 +74,8 @@ ; OBJ: LinkageName: _main ; OBJ: } -; Previously, g's InlineSite referenced t.h, which was wasteful. -; OBJ: InlineSite { +; Previously, g's InlineSiteSym referenced t.h, which was wasteful. 
+; OBJ: InlineSiteSym { ; OBJ: Inlinee: g (0x1002) ; OBJ: BinaryAnnotations [ ; OBJ-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0x6, LineOffset: 1} @@ -85,7 +85,7 @@ ; OBJ-NEXT: ] ; OBJ: } -; OBJ: InlineSite { +; OBJ: InlineSiteSym { ; OBJ: Inlinee: f (0x1003) ; OBJ: BinaryAnnotations [ ; OBJ-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0xD, LineOffset: 1} diff --git a/test/DebugInfo/COFF/inlining-levels.ll b/test/DebugInfo/COFF/inlining-levels.ll index 0c5c73c8fdbe5..7f93dbb850a2f 100644 --- a/test/DebugInfo/COFF/inlining-levels.ll +++ b/test/DebugInfo/COFF/inlining-levels.ll @@ -18,14 +18,14 @@ ; OBJ: Subsection [ ; OBJ: SubSectionType: Symbols (0xF1) -; OBJ: ProcStart { -; OBJ: InlineSite { +; OBJ: {{.*}}Proc{{.*}}Sym { +; OBJ: InlineSiteSym { ; OBJ: Inlinee: h (0x1002) ; OBJ: } -; OBJ: InlineSite { +; OBJ: InlineSiteSym { ; OBJ: Inlinee: g (0x1003) ; OBJ: } -; OBJ: InlineSite { +; OBJ: InlineSiteSym { ; OBJ: Inlinee: f (0x1004) ; OBJ: } ; OBJ: InlineSiteEnd { diff --git a/test/DebugInfo/COFF/inlining-same-name.ll b/test/DebugInfo/COFF/inlining-same-name.ll index 4a9c9924135d2..3700b7060a7a9 100644 --- a/test/DebugInfo/COFF/inlining-same-name.ll +++ b/test/DebugInfo/COFF/inlining-same-name.ll @@ -14,15 +14,15 @@ ; CHECK: CodeViewDebugInfo [ ; CHECK: Section: .debug$S ; CHECK: Subsection [ -; CHECK: ProcStart { +; CHECK: {{.*}}Proc{{.*}}Sym { ; CHECK: DisplayName: main ; CHECK: } -; CHECK: InlineSite { +; CHECK: InlineSiteSym { ; CHECK: Inlinee: same_name (0x1002) ; CHECK: } ; CHECK: InlineSiteEnd { ; CHECK: } -; CHECK: InlineSite { +; CHECK: InlineSiteSym { ; CHECK: Inlinee: same_name (0x1002) ; CHECK: } ; CHECK: InlineSiteEnd { diff --git a/test/DebugInfo/COFF/inlining.ll b/test/DebugInfo/COFF/inlining.ll index 76b8f8c88ee2d..ddfd5e056a1b9 100644 --- a/test/DebugInfo/COFF/inlining.ll +++ b/test/DebugInfo/COFF/inlining.ll @@ -166,7 +166,7 @@ ; OBJ: ] ; OBJ: Subsection [ ; OBJ: SubSectionType: Symbols (0xF1) -; OBJ: ProcStart { +; OBJ: {{.*}}Proc{{.*}}Sym { ; OBJ: PtrParent: 0x0 ; OBJ: PtrEnd: 0x0 ; OBJ: PtrNext: 0x0 @@ -181,7 +181,7 @@ ; OBJ: DisplayName: baz ; OBJ: LinkageName: ?baz@@YAXXZ ; OBJ: } -; OBJ: InlineSite { +; OBJ: InlineSiteSym { ; OBJ: PtrParent: 0x0 ; OBJ: PtrEnd: 0x0 ; OBJ: Inlinee: bar (0x1002) @@ -193,7 +193,7 @@ ; OBJ-NEXT: ChangeCodeLength: 0x7 ; OBJ: ] ; OBJ: } -; OBJ: InlineSite { +; OBJ: InlineSiteSym { ; OBJ: PtrParent: 0x0 ; OBJ: PtrEnd: 0x0 ; OBJ: Inlinee: foo (0x1003) diff --git a/test/DebugInfo/COFF/int8-char-type.ll b/test/DebugInfo/COFF/int8-char-type.ll index 82972a4528196..2e4395b4a599d 100644 --- a/test/DebugInfo/COFF/int8-char-type.ll +++ b/test/DebugInfo/COFF/int8-char-type.ll @@ -5,7 +5,7 @@ ; DW_ATE_[un]signed encoding for all integer types if they don't have distinct ; integer types for characters types. This was PR30552. -; CHECK-LABEL: DataSym { +; CHECK-LABEL: GlobalData { ; CHECK-NEXT: Kind: S_GDATA32 (0x110D) ; CHECK-NEXT: DataOffset: ; CHECK-NEXT: Type: signed char (0x10) @@ -13,7 +13,7 @@ ; CHECK-NEXT: LinkageName: x ; CHECK-NEXT: } -; CHECK-LABEL: DataSym { +; CHECK-LABEL: GlobalData { ; CHECK-NEXT: Kind: S_GDATA32 (0x110D) ; CHECK-NEXT: DataOffset: ; CHECK-NEXT: Type: unsigned char (0x20) diff --git a/test/DebugInfo/COFF/local-constant.ll b/test/DebugInfo/COFF/local-constant.ll index bf8ba8446a6d9..c99dd32e22e48 100644 --- a/test/DebugInfo/COFF/local-constant.ll +++ b/test/DebugInfo/COFF/local-constant.ll @@ -11,10 +11,11 @@ ; FIXME: Find a way to describe variables optimized to constants. 
-; OBJ: ProcStart { +; OBJ: {{.*}}Proc{{.*}}Sym { ; OBJ: DisplayName: constant_var ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { +; OBJ-NEXT: Kind: ; OBJ-NEXT: Type: int (0x74) ; OBJ-NEXT: Flags [ (0x100) ; OBJ-NEXT: IsOptimizedOut (0x100) diff --git a/test/DebugInfo/COFF/local-variable-gap.ll b/test/DebugInfo/COFF/local-variable-gap.ll index a2d05eaa03e41..ab38bbd8c13f8 100644 --- a/test/DebugInfo/COFF/local-variable-gap.ll +++ b/test/DebugInfo/COFF/local-variable-gap.ll @@ -66,12 +66,13 @@ ; ASM: .short 2 # Record length ; ASM: .short 4431 # Record kind: S_PROC_ID_END -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: VarName: p ; OBJ: } -; OBJ-NOT: Local { -; OBJ: DefRangeRegister { +; OBJ-NOT: LocalSym { +; OBJ: DefRangeRegisterSym { +; OBJ-NEXT: Kind: ; OBJ-NEXT: Register: 23 ; OBJ-NEXT: MayHaveNoName: 0 ; OBJ-NEXT: LocalVariableAddrRange { diff --git a/test/DebugInfo/COFF/local-variables.ll b/test/DebugInfo/COFF/local-variables.ll index 249b6e1103dba..f7087f76f4c1c 100644 --- a/test/DebugInfo/COFF/local-variables.ll +++ b/test/DebugInfo/COFF/local-variables.ll @@ -99,18 +99,18 @@ ; OBJ: Subsection [ ; OBJ: SubSectionType: Symbols (0xF1) -; OBJ: ProcStart { +; OBJ: {{.*}}Proc{{.*}}Sym { ; OBJ: DisplayName: f ; OBJ: LinkageName: f ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x1) ; OBJ: IsParameter (0x1) ; OBJ: ] ; OBJ: VarName: param ; OBJ: } -; OBJ: DefRangeRegisterRel { +; OBJ: DefRangeRegisterRelSym { ; OBJ: BaseRegister: 335 ; OBJ: HasSpilledUDTMember: No ; OBJ: OffsetInParent: 0 @@ -121,13 +121,13 @@ ; OBJ: Range: 0x4F ; OBJ: } ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x0) ; OBJ: ] ; OBJ: VarName: a ; OBJ: } -; OBJ: DefRangeRegisterRel { +; OBJ: DefRangeRegisterRelSym { ; OBJ: BaseRegister: 335 ; OBJ: HasSpilledUDTMember: No ; OBJ: OffsetInParent: 0 @@ -138,13 +138,13 @@ ; OBJ: Range: 0x21 ; OBJ: } ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x0) ; OBJ: ] ; OBJ: VarName: b ; OBJ: } -; OBJ: DefRangeRegisterRel { +; OBJ: DefRangeRegisterRelSym { ; OBJ: BaseRegister: 335 ; OBJ: HasSpilledUDTMember: No ; OBJ: OffsetInParent: 0 @@ -155,7 +155,7 @@ ; OBJ: Range: 0x1F ; OBJ: } ; OBJ: } -; OBJ: InlineSite { +; OBJ: InlineSiteSym { ; OBJ: PtrParent: 0x0 ; OBJ: PtrEnd: 0x0 ; OBJ: Inlinee: will_be_inlined (0x1002) @@ -166,13 +166,13 @@ ; OBJ: ChangeCodeLength: 0xC ; OBJ: ] ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x0) ; OBJ: ] ; OBJ: VarName: v ; OBJ: } -; OBJ: DefRangeRegisterRel { +; OBJ: DefRangeRegisterRelSym { ; OBJ: BaseRegister: 335 ; OBJ: HasSpilledUDTMember: No ; OBJ: OffsetInParent: 0 @@ -185,7 +185,7 @@ ; OBJ: } ; OBJ: InlineSiteEnd { ; OBJ: } -; OBJ: InlineSite { +; OBJ: InlineSiteSym { ; OBJ: PtrParent: 0x0 ; OBJ: PtrEnd: 0x0 ; OBJ: Inlinee: will_be_inlined (0x1002) @@ -196,13 +196,13 @@ ; OBJ: ChangeCodeLength: 0xA ; OBJ: ] ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x0) ; OBJ: ] ; OBJ: VarName: v ; OBJ: } -; OBJ: DefRangeRegisterRel { +; OBJ: DefRangeRegisterRelSym { ; OBJ: BaseRegister: 335 ; OBJ: HasSpilledUDTMember: No ; OBJ: OffsetInParent: 0 diff --git a/test/DebugInfo/COFF/long-name.ll b/test/DebugInfo/COFF/long-name.ll index 998d77f7ca06d..65bd4c16f7508 100644 --- a/test/DebugInfo/COFF/long-name.ll +++ b/test/DebugInfo/COFF/long-name.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -filetype=obj | llvm-readobj -codeview | FileCheck %s -; CHECK: ProcStart { +; CHECK: 
{{.*}}Proc{{.*}}Sym { ; CHECK: Kind: S_GPROC32_ID (0x1147) ; CHECK: FunctionType: {{A+}} (0x1002) ; CHECK: CodeOffset: f+0x0 diff --git a/test/DebugInfo/COFF/multifile.ll b/test/DebugInfo/COFF/multifile.ll index 5e53fa57acc47..8af99a6063e66 100644 --- a/test/DebugInfo/COFF/multifile.ll +++ b/test/DebugInfo/COFF/multifile.ll @@ -43,13 +43,13 @@ ; OBJ32: ] ; OBJ32: Subsection [ ; OBJ32-NEXT: SubSectionType: Symbols (0xF1) -; OBJ32: ProcStart { +; OBJ32: {{.*}}Proc{{.*}}Sym { ; OBJ32: CodeSize: 0x10 ; OBJ32: DisplayName: f ; OBJ32: LinkageName: _f ; OBJ32: } ; OBJ32-NEXT: ProcEnd { -; OBJ32-NEXT: } +; OBJ32: } ; OBJ32-NEXT: ] ; OBJ32: FunctionLineTable [ ; OBJ32-NEXT: Name: _f @@ -115,13 +115,13 @@ ; OBJ64: ] ; OBJ64: Subsection [ ; OBJ64-NEXT: SubSectionType: Symbols (0xF1) -; OBJ64: ProcStart { +; OBJ64: {{.*}}Proc{{.*}}Sym { ; OBJ64: CodeSize: 0x18 ; OBJ64: DisplayName: f ; OBJ64: LinkageName: f ; OBJ64: } ; OBJ64-NEXT: ProcEnd { -; OBJ64-NEXT: } +; OBJ64: } ; OBJ64-NEXT: ] ; OBJ64: FunctionLineTable [ ; OBJ64-NEXT: Name: f diff --git a/test/DebugInfo/COFF/multifunction.ll b/test/DebugInfo/COFF/multifunction.ll index a6290e8f021da..87db2a20eaa6c 100644 --- a/test/DebugInfo/COFF/multifunction.ll +++ b/test/DebugInfo/COFF/multifunction.ll @@ -145,7 +145,7 @@ ; OBJ32: ] ; OBJ32: Subsection [ ; OBJ32-NEXT: SubSectionType: Symbols (0xF1) -; OBJ32: ProcStart { +; OBJ32: {{.*}}Proc{{.*}}Sym { ; OBJ32: Kind: S_LPROC32_ID (0x1146) ; OBJ32: CodeSize: 0x6 ; OBJ32: DisplayName: x @@ -159,7 +159,7 @@ ; OBJ32: ] ; OBJ32: Subsection [ ; OBJ32-NEXT: SubSectionType: Symbols (0xF1) -; OBJ32: ProcStart { +; OBJ32: {{.*}}Proc{{.*}}Sym { ; OBJ32: Kind: S_GPROC32_ID (0x1147) ; OBJ32: CodeSize: 0x6 ; OBJ32: DisplayName: y @@ -173,7 +173,7 @@ ; OBJ32: ] ; OBJ32: Subsection [ ; OBJ32-NEXT: SubSectionType: Symbols (0xF1) -; OBJ32: ProcStart { +; OBJ32: {{.*}}Proc{{.*}}Sym { ; OBJ32: Kind: S_GPROC32_ID (0x1147) ; OBJ32: CodeSize: 0x10 ; OBJ32: DisplayName: f @@ -419,7 +419,7 @@ ; OBJ64-NEXT: ] ; OBJ64: Subsection [ ; OBJ64-NEXT: SubSectionType: Symbols (0xF1) -; OBJ64: ProcStart { +; OBJ64: {{.*}}Proc{{.*}}Sym { ; OBJ64: Kind: S_LPROC32_ID (0x1146) ; OBJ64: CodeSize: 0xE ; OBJ64: DisplayName: x @@ -433,7 +433,7 @@ ; OBJ64: ] ; OBJ64: Subsection [ ; OBJ64-NEXT: SubSectionType: Symbols (0xF1) -; OBJ64: ProcStart { +; OBJ64: {{.*}}Proc{{.*}}Sym { ; OBJ64: Kind: S_GPROC32_ID (0x1147) ; OBJ64: CodeSize: 0xE ; OBJ64: DisplayName: y @@ -447,7 +447,7 @@ ; OBJ64: ] ; OBJ64: Subsection [ ; OBJ64-NEXT: SubSectionType: Symbols (0xF1) -; OBJ64: ProcStart { +; OBJ64: {{.*}}Proc{{.*}}Sym { ; OBJ64: Kind: S_GPROC32_ID (0x1147) ; OBJ64: CodeSize: 0x18 ; OBJ64: DisplayName: f diff --git a/test/DebugInfo/COFF/pieces.ll b/test/DebugInfo/COFF/pieces.ll index 60330e0577267..098f2ae62f0b1 100644 --- a/test/DebugInfo/COFF/pieces.ll +++ b/test/DebugInfo/COFF/pieces.ll @@ -105,21 +105,21 @@ ; ASM: .cv_def_range [[oy_start]] [[oy_end]], "C\021\027\000\000\000\004\000\000\000" -; OBJ-LABEL: ProcStart { +; OBJ-LABEL: {{.*}}Proc{{.*}}Sym { ; OBJ: Kind: S_GPROC32_ID (0x1147) ; OBJ: DisplayName: loop_csr ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: VarName: o ; OBJ: } -; OBJ: DefRangeSubfieldRegister { +; OBJ: DefRangeSubfieldRegisterSym { ; OBJ: Register: 24 ; OBJ: MayHaveNoName: 0 ; OBJ: OffsetInParent: 0 ; OBJ: LocalVariableAddrRange { ; OBJ: } ; OBJ: } -; OBJ: DefRangeSubfieldRegister { +; OBJ: DefRangeSubfieldRegisterSym { ; OBJ: Register: 23 ; OBJ: MayHaveNoName: 0 ; OBJ: OffsetInParent: 4 @@ -135,14 +135,14 @@ ; ASM: .asciz "o" ; ASM: 
.cv_def_range .Lfunc_begin1 .Lfunc_end1, "C\021\022\000\000\000\004\000\000\000" -; OBJ-LABEL: ProcStart { +; OBJ-LABEL: {{.*}}Proc{{.*}}Sym { ; OBJ: Kind: S_GPROC32_ID (0x1147) ; OBJ: DisplayName: pad_right ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: VarName: o ; OBJ: } -; OBJ: DefRangeSubfieldRegister { +; OBJ: DefRangeSubfieldRegisterSym { ; OBJ: Register: 18 ; OBJ: MayHaveNoName: 0 ; OBJ: OffsetInParent: 4 @@ -158,14 +158,14 @@ ; ASM: .asciz "o" ; ASM: .cv_def_range .Lfunc_begin2 .Lfunc_end2, "C\021\022\000\000\000\000\000\000\000" -; OBJ-LABEL: ProcStart { +; OBJ-LABEL: {{.*}}Proc{{.*}}Sym { ; OBJ: Kind: S_GPROC32_ID (0x1147) ; OBJ: DisplayName: pad_left ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: VarName: o ; OBJ: } -; OBJ: DefRangeSubfieldRegister { +; OBJ: DefRangeSubfieldRegisterSym { ; OBJ: Register: 18 ; OBJ: MayHaveNoName: 0 ; OBJ: OffsetInParent: 0 @@ -185,17 +185,17 @@ ; ASM: .asciz "p" ; ASM: .cv_def_range [[p_start]] .Lfunc_end3, "C\021\021\000\000\000\004\000\000\000" -; OBJ-LABEL: ProcStart { +; OBJ-LABEL: {{.*}}Proc{{.*}}Sym { ; OBJ: Kind: S_GPROC32_ID (0x1147) ; OBJ: DisplayName: nested ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: VarName: o ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: VarName: p ; OBJ: } -; OBJ: DefRangeSubfieldRegister { +; OBJ: DefRangeSubfieldRegisterSym { ; OBJ: Register: 17 ; OBJ: MayHaveNoName: 0 ; OBJ: OffsetInParent: 4 @@ -212,14 +212,14 @@ ; ASM: .asciz "o" ; ASM: .cv_def_range [[spill_o_x_start]] [[spill_o_x_end]], "E\021O\001A\000$\000\000\000" -; OBJ-LABEL: ProcStart { +; OBJ-LABEL: {{.*}}Proc{{.*}}Sym { ; OBJ: Kind: S_GPROC32_ID (0x1147) ; OBJ: DisplayName: bitpiece_spill ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: VarName: o ; OBJ: } -; OBJ: DefRangeRegisterRel { +; OBJ: DefRangeRegisterRelSym { ; OBJ: BaseRegister: 335 ; OBJ: HasSpilledUDTMember: Yes ; OBJ: OffsetInParent: 4 diff --git a/test/DebugInfo/COFF/register-variables.ll b/test/DebugInfo/COFF/register-variables.ll index d0ca5ca2afadd..f8cd5c4fc3c19 100644 --- a/test/DebugInfo/COFF/register-variables.ll +++ b/test/DebugInfo/COFF/register-variables.ll @@ -81,17 +81,17 @@ ; OBJ: Subsection [ ; OBJ: SubSectionType: Symbols (0xF1) -; OBJ: ProcStart { +; OBJ: {{.*}}Proc{{.*}}Sym { ; OBJ: DisplayName: f ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x1) ; OBJ: IsParameter (0x1) ; OBJ: ] ; OBJ: VarName: p ; OBJ: } -; OBJ: DefRangeRegister { +; OBJ: DefRangeRegisterSym { ; OBJ: Register: 18 ; OBJ: LocalVariableAddrRange { ; OBJ: OffsetStart: .text+0x0 @@ -99,7 +99,7 @@ ; OBJ: Range: 0x7 ; OBJ: } ; OBJ: } -; OBJ: DefRangeRegister { +; OBJ: DefRangeRegisterSym { ; OBJ: Register: 23 ; OBJ: LocalVariableAddrRange { ; OBJ: OffsetStart: .text+0x7 @@ -107,13 +107,13 @@ ; OBJ: Range: 0x18 ; OBJ: } ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x0) ; OBJ: ] ; OBJ: VarName: a ; OBJ: } -; OBJ: DefRangeRegister { +; OBJ: DefRangeRegisterSym { ; OBJ: Register: 17 ; OBJ: LocalVariableAddrRange { ; OBJ: OffsetStart: .text+0xC @@ -121,13 +121,13 @@ ; OBJ: Range: 0x6 ; OBJ: } ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x0) ; OBJ: ] ; OBJ: VarName: c ; OBJ: } -; OBJ: DefRangeRegister { +; OBJ: DefRangeRegisterSym { ; OBJ: Register: 17 ; OBJ: LocalVariableAddrRange { ; OBJ: OffsetStart: .text+0xC @@ -135,13 +135,13 @@ ; OBJ: Range: 0x4 ; OBJ: } ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x0) ; OBJ: ] ; OBJ: VarName: b ; 
OBJ: } -; OBJ: DefRangeRegister { +; OBJ: DefRangeRegisterSym { ; OBJ: Register: 17 ; OBJ: MayHaveNoName: 0 ; OBJ: OffsetStart: .text+0x12 @@ -149,19 +149,19 @@ ; OBJ: Range: 0x6 ; OBJ: } ; OBJ: } -; OBJ: InlineSite { +; OBJ: InlineSiteSym { ; OBJ: PtrParent: 0x0 ; OBJ: PtrEnd: 0x0 ; OBJ: Inlinee: inlineinc (0x1002) ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x1) ; OBJ: IsParameter (0x1) ; OBJ: ] ; OBJ: VarName: a ; OBJ: } -; OBJ: DefRangeRegister { +; OBJ: DefRangeRegisterSym { ; OBJ: Register: 17 ; OBJ: LocalVariableAddrRange { ; OBJ: OffsetStart: .text+0xC @@ -169,13 +169,13 @@ ; OBJ: Range: 0x6 ; OBJ: } ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x0) ; OBJ: ] ; OBJ: VarName: b ; OBJ: } -; OBJ: DefRangeRegister { +; OBJ: DefRangeRegisterSym { ; OBJ: Register: 17 ; OBJ: LocalVariableAddrRange { ; OBJ: OffsetStart: .text+0x12 diff --git a/test/DebugInfo/COFF/simple.ll b/test/DebugInfo/COFF/simple.ll index 3a0b1c9fa7cd6..50d121be6942c 100644 --- a/test/DebugInfo/COFF/simple.ll +++ b/test/DebugInfo/COFF/simple.ll @@ -77,13 +77,13 @@ ; OBJ32-NEXT: ] ; OBJ32: Subsection [ ; OBJ32-NEXT: SubSectionType: Symbols (0xF1) -; OBJ32: ProcStart { +; OBJ32: {{.*}}Proc{{.*}}Sym { ; OBJ32: CodeSize: 0x6 ; OBJ32: DisplayName: f ; OBJ32: LinkageName: _f ; OBJ32: } ; OBJ32-NEXT: ProcEnd { -; OBJ32-NEXT: } +; OBJ32: } ; OBJ32-NEXT: ] ; OBJ32: FunctionLineTable [ ; OBJ32-NEXT: Name: _f @@ -174,13 +174,13 @@ ; OBJ64-NEXT: ] ; OBJ64: Subsection [ ; OBJ64-NEXT: SubSectionType: Symbols (0xF1) -; OBJ64: ProcStart { +; OBJ64: {{.*}}Proc{{.*}}Sym { ; OBJ64: CodeSize: 0xE ; OBJ64: DisplayName: f ; OBJ64: LinkageName: f ; OBJ64: } ; OBJ64-NEXT: ProcEnd { -; OBJ64-NEXT: } +; OBJ64: } ; OBJ64-NEXT: ] ; OBJ64: FunctionLineTable [ ; OBJ64-NEXT: Name: f diff --git a/test/DebugInfo/COFF/typedef.ll b/test/DebugInfo/COFF/typedef.ll index cf4e3df257de6..9d841419c561a 100644 --- a/test/DebugInfo/COFF/typedef.ll +++ b/test/DebugInfo/COFF/typedef.ll @@ -2,7 +2,7 @@ ; CHECK: CodeViewDebugInfo [ ; CHECK: Subsection [ -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: wchar_t (0x71) ; CHECK: Flags [ (0x0) ; CHECK: ] @@ -10,7 +10,7 @@ ; CHECK: } ; CHECK: Subsection [ ; CHECK: SubSectionType: Symbols (0xF1) -; CHECK: UDT { +; CHECK: UDTSym { ; CHECK: Type: wchar_t (0x71) ; CHECK: UDTName: XYZ ; CHECK: } diff --git a/test/DebugInfo/COFF/types-array.ll b/test/DebugInfo/COFF/types-array.ll index dca3884b1d099..1a4afa8bd2195 100644 --- a/test/DebugInfo/COFF/types-array.ll +++ b/test/DebugInfo/COFF/types-array.ll @@ -46,7 +46,7 @@ ; CHECK: Magic: 0x4 ; CHECK: Subsection [ ; CHECK: SubSectionType: Symbols (0xF1) -; CHECK: ProcStart { +; CHECK: {{.*}}Proc{{.*}}Sym { ; CHECK: PtrParent: 0x0 ; CHECK: PtrEnd: 0x0 ; CHECK: PtrNext: 0x0 @@ -61,13 +61,13 @@ ; CHECK: DisplayName: f ; CHECK: LinkageName: ?f@@YAXXZ ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: 0x1003 ; CHECK: Flags [ (0x0) ; CHECK: ] ; CHECK: VarName: a ; CHECK: } -; CHECK: DefRangeRegisterRel { +; CHECK: DefRangeRegisterRelSym { ; CHECK: BaseRegister: 22 ; CHECK: HasSpilledUDTMember: No ; CHECK: OffsetInParent: 0 diff --git a/test/DebugInfo/COFF/types-basic.ll b/test/DebugInfo/COFF/types-basic.ll index 4ead4bfc1c4cf..4b9fcd864c276 100644 --- a/test/DebugInfo/COFF/types-basic.ll +++ b/test/DebugInfo/COFF/types-basic.ll @@ -218,7 +218,7 @@ ; CHECK: CodeViewDebugInfo [ ; CHECK: Subsection [ ; CHECK: SubSectionType: Symbols (0xF1) -; CHECK: ProcStart { +; CHECK: {{.*}}Proc{{.*}}Sym { 
; CHECK: DbgStart: 0x0 ; CHECK: DbgEnd: 0x0 ; CHECK: FunctionType: f (0x1002) @@ -229,68 +229,68 @@ ; CHECK: DisplayName: f ; CHECK: LinkageName: ?f@@YAXMN_J@Z ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: float (0x40) ; CHECK: Flags [ (0x1) ; CHECK: IsParameter (0x1) ; CHECK: ] ; CHECK: VarName: p1 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: double (0x41) ; CHECK: Flags [ (0x1) ; CHECK: IsParameter (0x1) ; CHECK: ] ; CHECK: VarName: p2 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: __int64 (0x13) ; CHECK: Flags [ (0x1) ; CHECK: IsParameter (0x1) ; CHECK: ] ; CHECK: VarName: p3 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: int (0x74) ; CHECK: VarName: v1 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: int* (0x674) ; CHECK: VarName: v2 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: const int* (0x1004) ; CHECK: VarName: v21 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: void* (0x603) ; CHECK: VarName: v3 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: int A::* (0x1006) ; CHECK: VarName: v4 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: void A::() A::* (0x100E) ; CHECK: VarName: v5 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: long (0x12) ; CHECK: VarName: l1 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: long (0x12) ; CHECK: VarName: l2 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: unsigned long (0x22) ; CHECK: VarName: l3 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: unsigned long (0x22) ; CHECK: VarName: l4 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: const void* (0x1010) ; CHECK: VarName: v6 ; CHECK: } @@ -298,48 +298,48 @@ ; CHECK: } ; CHECK: ] ; CHECK: Subsection [ -; CHECK: ProcStart { +; CHECK: {{.*}}Proc{{.*}}Sym { ; CHECK: Type: CharTypes (0x1012) ; CHECK: DisplayName: CharTypes ; CHECK: LinkageName: ?CharTypes@@YAXXZ ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: wchar_t (0x71) ; CHECK: Flags [ (0x0) ; CHECK: ] ; CHECK: VarName: w ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: unsigned short (0x21) ; CHECK: Flags [ (0x0) ; CHECK: ] ; CHECK: VarName: us ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: char (0x70) ; CHECK: Flags [ (0x0) ; CHECK: ] ; CHECK: VarName: c ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: unsigned char (0x20) ; CHECK: Flags [ (0x0) ; CHECK: ] ; CHECK: VarName: uc ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: signed char (0x10) ; CHECK: Flags [ (0x0) ; CHECK: ] ; CHECK: VarName: sc ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: char16_t (0x7A) ; CHECK: Flags [ (0x0) ; CHECK: ] ; CHECK: VarName: c16 ; CHECK: } -; CHECK: Local { +; CHECK: LocalSym { ; CHECK: Type: char32_t (0x7B) ; CHECK: Flags [ (0x0) ; CHECK: ] diff --git a/test/DebugInfo/COFF/udts.ll b/test/DebugInfo/COFF/udts.ll index abc688d70a61a..735901f7571cb 100644 --- a/test/DebugInfo/COFF/udts.ll +++ b/test/DebugInfo/COFF/udts.ll @@ -18,37 +18,39 @@ target triple = "i686-pc-windows-msvc18.0.0" ; typedef struct { int x; } U; ; U u; -; CHECK: ProcStart { +; CHECK: {{.*}}Proc{{.*}}Sym { ; CHECK: DisplayName: f ; CHECK: LinkageName: ?f@@YAXXZ ; CHECK: } -; CHECK: UDT { +; CHECK: UDTSym { +; CHECK-NEXT: Kind: S_UDT (0x1108) ; CHECK-NEXT: Type: int (0x74) ; CHECK-NEXT: UDTName: f::FOO ; CHECK-NEXT: } ; CHECK-NEXT: ProcEnd { -; CHECK-NEXT: } -; 
CHECK: ProcStart {
+; CHECK: {{.*}}Proc{{.*}}Sym {
; CHECK: DisplayName: g
; CHECK: LinkageName: ?g@@YAMPEAUS@@@Z
; CHECK: }
-; CHECK: UDT {
+; CHECK: UDTSym {
+; CHECK-NEXT: Kind: S_UDT (0x1108)
; CHECK-NEXT: Type: g::pun (0x{{[0-9A-F]+}})
; CHECK-NEXT: UDTName: g::pun
; CHECK-NEXT: }
; CHECK-NEXT: ProcEnd {
-; CHECK-NEXT: }
; CHECK: Subsection
-; CHECK-NOT: ProcStart
-; CHECK: UDT {
+; CHECK-NOT: {{.*}}Proc{{.*}}Sym
+; CHECK: UDTSym {
+; CHECK-NEXT: Kind: S_UDT (0x1108)
; CHECK-NEXT: Type: S (0x{{[0-9A-F]+}})
; CHECK-NEXT: UDTName: S
-; CHECK: UDT {
+; CHECK: UDTSym {
+; CHECK-NEXT: Kind: S_UDT (0x1108)
; CHECK-NEXT: Type: <unnamed-tag> (0x{{[0-9A-F]+}})
; CHECK-NEXT: UDTName: U
-; CHECK-NOT: UDT {
+; CHECK-NOT: UDTSym {
%struct.U = type { i32 }
%struct.S = type { i32 }
diff --git a/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.o b/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.o
Binary files differ
new file mode 100644
index 0000000000000..c0ed489d846c7
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.o
diff --git a/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.s b/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.s
new file mode 100644
index 0000000000000..9ee9ad234d84c
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-str-offsets-macho.s
@@ -0,0 +1,201 @@
+# Test object to verify dwarfdump handles v5 string offset tables in Mach-O.
+# This is similar to dwarfdump-str-offsets.s with 2 CUs and 1 TU, but no
+# split sections.
+#
+# To generate the test object:
+# llvm-mc -triple i386-apple-darwin9 dwarfdump-str-offsets-macho.s -filetype=obj \
+#   -o dwarfdump-str-offsets-macho.o
+
+  .section __DWARF,__debug_str,regular,debug
+Linfo_string:
+str_producer:
+  .asciz "Handmade DWARF producer"
+str_CU1:
+  .asciz "Compile_Unit_1"
+str_CU1_dir:
+  .asciz "/home/test/CU1"
+str_CU2:
+  .asciz "Compile_Unit_2"
+str_CU2_dir:
+  .asciz "/home/test/CU2"
+str_TU:
+  .asciz "Type_Unit"
+str_TU_type:
+  .asciz "MyStruct"
+str_Subprogram:
+  .asciz "MyFunc"
+str_Variable1:
+  .asciz "MyVar1"
+str_Variable2:
+  .asciz "MyVar2"
+str_Variable3:
+  .asciz "MyVar3"
+
+  .section __DWARF,__debug_str_offs,regular,debug
+Ldebug_str_offsets:
+  .long Ldebug_str_offsets_segment0_end-Ldebug_str_offsets_base0
+  .short 5              # DWARF version
+  .short 0              # Padding
+Ldebug_str_offsets_base0:
+  .long str_producer
+  .long str_CU1
+  .long str_CU1_dir
+  .long str_Subprogram
+  .long str_Variable1
+  .long str_Variable2
+  .long str_Variable3
+Ldebug_str_offsets_segment0_end:
+# CU2's contribution
+  .long Ldebug_str_offsets_segment1_end-Ldebug_str_offsets_base1
+  .short 5              # DWARF version
+  .short 0              # Padding
+Ldebug_str_offsets_base1:
+  .long str_producer
+  .long str_CU2
+  .long str_CU2_dir
+Ldebug_str_offsets_segment1_end:
+# The TU's contribution
+  .long Ldebug_str_offsets_segment2_end-Ldebug_str_offsets_base2
+  .short 5              # DWARF version
+  .short 0              # Padding
+Ldebug_str_offsets_base2:
+  .long str_TU
+  .long str_TU_type
+Ldebug_str_offsets_segment2_end:
+
+  .section __DWARF,__debug_abbrev,regular,debug
+Lsection_abbrev:
+  .byte 0x01            # Abbrev code
+  .byte 0x11            # DW_TAG_compile_unit
+  .byte 0x01            # DW_CHILDREN_yes
+  .byte 0x25            # DW_AT_producer
+  .byte 0x1a            # DW_FORM_strx
+  .byte 0x03            # DW_AT_name
+  .byte 0x1a            # DW_FORM_strx
+  .byte 0x72            # DW_AT_str_offsets_base
+  .byte 0x17            # DW_FORM_sec_offset
+  .byte 0x1b            # DW_AT_comp_dir
+  .byte 0x1a            # DW_FORM_strx
+  .byte 0x00            # EOM(1)
+  .byte 0x00            # EOM(2)
+  .byte 0x02            # Abbrev code
+  .byte 0x41            # DW_TAG_type_unit
+  .byte 0x01            # DW_CHILDREN_yes
+  .byte 0x03            # DW_AT_name
+  .byte 0x1a            # 
DW_FORM_strx + .byte 0x72 # DW_AT_str_offsets_base + .byte 0x17 # DW_FORM_sec_offset + .byte 0x00 # EOM(1) + .byte 0x00 # EOM(2) + .byte 0x03 # Abbrev code + .byte 0x13 # DW_TAG_structure_type + .byte 0x00 # DW_CHILDREN_no (no members) + .byte 0x03 # DW_AT_name + .byte 0x1a # DW_FORM_strx + .byte 0x00 # EOM(1) + .byte 0x00 # EOM(2) + .byte 0x04 # Abbrev code + .byte 0x2e # DW_TAG_subprogram + .byte 0x01 # DW_CHILDREN_yes + .byte 0x03 # DW_AT_name + .byte 0x25 # DW_FORM_strx1 + .byte 0x00 # EOM(1) + .byte 0x00 # EOM(2) + .byte 0x05 # Abbrev code + .byte 0x34 # DW_TAG_variable + .byte 0x00 # DW_CHILDREN_no + .byte 0x03 # DW_AT_name + .byte 0x26 # DW_FORM_strx2 + .byte 0x00 # EOM(1) + .byte 0x00 # EOM(2) + .byte 0x06 # Abbrev code + .byte 0x34 # DW_TAG_variable + .byte 0x00 # DW_CHILDREN_no + .byte 0x03 # DW_AT_name + .byte 0x27 # DW_FORM_strx3 + .byte 0x00 # EOM(1) + .byte 0x00 # EOM(2) + .byte 0x07 # Abbrev code + .byte 0x34 # DW_TAG_variable + .byte 0x00 # DW_CHILDREN_no + .byte 0x03 # DW_AT_name + .byte 0x28 # DW_FORM_strx4 + .byte 0x00 # EOM(1) + .byte 0x00 # EOM(2) + .byte 0x00 # EOM(3) + + .section __DWARF,__debug_info,regular,debug +Lsection_info: +# DWARF v5 CU header. + .long CU1_5_end-CU1_5_version # Length of Unit +CU1_5_version: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. Section +# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name, +# DW_AT_str_offsets and DW_AT_compdir. + .byte 1 # Abbreviation code + .byte 0 # The index of the producer string + .byte 1 # The index of the CU name string + .long Ldebug_str_offsets_base0-Ldebug_str_offsets + .byte 2 # The index of the comp dir string +# A subprogram DIE with DW_AT_name, using DW_FORM_strx1. + .byte 4 # Abbreviation code + .byte 3 # Subprogram name string (DW_FORM_strx1) +# A variable DIE with DW_AT_name, using DW_FORM_strx2. + .byte 5 # Abbreviation code + .short 0x0004 # Subprogram name string (DW_FORM_strx2) +# A variable DIE with DW_AT_name, using DW_FORM_strx3. + .byte 6 # Abbreviation code + .byte 5 # Subprogram name string (DW_FORM_strx3) + .short 0 # Subprogram name string (DW_FORM_strx3) +# A variable DIE with DW_AT_name, using DW_FORM_strx4. + .byte 7 # Abbreviation code + .quad 0x00000006 # Subprogram name string (DW_FORM_strx4) + .byte 0 # NULL + .byte 0 # NULL + .byte 0 # NULL +CU1_5_end: + +# DWARF v5 CU header + .long CU2_5_end-CU2_5_version # Length of Unit +CU2_5_version: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. Section +# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name, +# DW_AT_str_offsets and DW_AT_compdir. + .byte 1 # Abbreviation code + .byte 0 # The index of the producer string + .byte 1 # The index of the CU name string + .long Ldebug_str_offsets_base1-Ldebug_str_offsets + .byte 2 # The index of the comp dir string + .byte 0 # NULL +CU2_5_end: + + .section __DWARF,__debug_types,regular,debug +# DWARF v5 Type unit header. +TU_5_start: + .long TU_5_end-TU_5_version # Length of Unit +TU_5_version: + .short 5 # DWARF version number + .byte 2 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. Section + .quad 0x0011223344556677 # Type Signature + .long TU_5_type-TU_5_start # Type offset +# The type-unit DIE, which has a name. 
+  .byte 2               # Abbreviation code
+  .byte 0               # Index of the unit type name string
+  .long Ldebug_str_offsets_base2-Ldebug_str_offsets  # offset into the str_offsets section
+# The type DIE, which has a name.
+TU_5_type:
+  .byte 3               # Abbreviation code
+  .byte 1               # Index of the type name string
+  .byte 0               # NULL
+  .byte 0               # NULL
+TU_5_end:
+
+
+.subsections_via_symbols
diff --git a/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 space b/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64-space
Binary files differ
index 7330cd8baa1e9..7330cd8baa1e9 100755
--- a/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 space
+++ b/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64-space
diff --git a/test/DebugInfo/PDB/Inputs/every-type.cpp b/test/DebugInfo/PDB/Inputs/every-type.cpp
new file mode 100644
index 0000000000000..ed715b0343001
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/every-type.cpp
@@ -0,0 +1,63 @@
+// Build with "cl.exe /Zi /GR- /GX- every-type.cpp /link /debug /nodefaultlib /entry:main"
+
+// clang-format off
+void *__purecall = 0;
+
+void __cdecl operator delete(void *,unsigned int) {}
+
+struct FooStruct { };  // LF_STRUCTURE
+
+class FooClass {       // LF_CLASS
+                       // LF_FIELDLIST
+  enum NestedEnum {    // LF_ENUM
+                       // LF_NESTTYPE
+    A, B, C            // LF_ENUMERATE
+  };
+
+  void RegularMethod() {}  // LF_ARGLIST
+                           // LF_ONEMETHOD
+                           // LF_MFUNCTION
+
+  void OverloadedMethod(int) {}  // LF_METHODLIST
+                                 // LF_METHOD
+  void OverloadedMethod(int, int) {}
+
+  int HiNibble : 4;  // LF_BITFIELD
+  int LoNibble : 4;
+  NestedEnum EnumVariable;    // LF_MEMBER
+  static void *StaticMember;  // LF_POINTER
+                              // LF_STMEMBER
+};
+
+void *FooClass::StaticMember = nullptr;
+
+class Inherit : public FooClass {  // LF_BCLASS
+public:
+  virtual ~Inherit() {}  // LF_VTSHAPE
+                         // LF_VFUNCTAB
+};
+
+class VInherit : public virtual FooClass {  // LF_VBCLASS
+
+};
+
+class IVInherit : public VInherit {  // LF_IVBCLASS
+};
+
+union TheUnion {
+  int X;  // LF_UNION
+};
+
+int SomeArray[7] = {1, 2, 3, 4, 5, 6, 7};  // LF_ARRAY
+
+int main(int argc, char **argv) {  // LF_PROCEDURE
+  const int X = 7;  // LF_MODIFIER
+
+  FooStruct FooStructInstance;
+  FooClass FooClassInstance;
+  Inherit InheritInstance;
+  VInherit VInheritInstance;
+  IVInherit IVInheritInstance;
+  TheUnion UnionInstance;
+  return SomeArray[argc];
+}
diff --git a/test/DebugInfo/PDB/Inputs/every-type.pdb b/test/DebugInfo/PDB/Inputs/every-type.pdb
Binary files differ
new file mode 100644
index 0000000000000..64996d61d3e72
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/every-type.pdb
diff --git a/test/DebugInfo/PDB/Inputs/every-type.yaml b/test/DebugInfo/PDB/Inputs/every-type.yaml
new file mode 100644
index 0000000000000..8f23e8ad5e894
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/every-type.yaml
@@ -0,0 +1,272 @@
+---
+TpiStream:
+  Records:
+    # int* [Index: 0x1000]
+    - Kind: LF_POINTER
+      Pointer:
+        ReferentType: 116  # int
+        Attrs: 32778
+    # const int* [Index: 0x1001]
+    - Kind: LF_MODIFIER
+      Modifier:
+        ModifiedType: 0x1000
+        Modifiers: [ Const ]
+    # char* [Index: 0x1002]
+    - Kind: LF_POINTER
+      Pointer:
+        ReferentType: 1136  # char*
+        Attrs: 32778
+    # (int, char **) [Index: 0x1003]
+    - Kind: LF_ARGLIST
+      ArgList:
+        ArgIndicies: [ 116, 0x1002 ]
+    # (int, double) [Index: 0x1004]
+    - Kind: LF_ARGLIST
+      ArgList:
+        ArgIndicies: [ 116, 65 ]  # (int, double)
+    # int main(int argc, char **argv) [Index: 0x1005]
+    - Kind: LF_PROCEDURE
+      Procedure:
+        ReturnType: 117  # int
+        CallConv: NearC  # __cdecl
+        Options: [ None ]
+        ParameterCount: 2
+        ArgumentList: 0x1003  # (int, char**)
+    # <label> [Index: 0x1006]
+    - Kind: 
LF_LABEL
+    Label:
+      Mode: Near
+  # <forward decl>
+  # class FooClass; [Index: 0x1007]
+  - Kind: LF_STRUCTURE
+    Class:
+      MemberCount: 0
+      Options: [ None, ForwardReference ]
+      FieldList: 0
+      Name: 'FooClass'
+      DerivationList: 0
+      VTableShape: 0
+      Size: 0
+  # const FooClass* [Index: 0x1008]
+  - Kind: LF_POINTER
+    Pointer:
+      ReferentType: 0x1007 # FooClass
+      Attrs: 33802 # const
+  # int (FooClass::)(int, char **) [Index: 0x1009]
+  - Kind: LF_MFUNCTION
+    MemberFunction:
+      ReturnType: 116 # int
+      ClassType: 0x1007 # FooClass
+      ThisType: 0x1008 # const FooClass*
+      CallConv: ThisCall
+      Options: [ None ]
+      ParameterCount: 2
+      ArgumentList: 0x1003 # (int, char**)
+      ThisPointerAdjustment: 0
+  # int (FooClass::)(int, double) [Index: 0x100A]
+  - Kind: LF_MFUNCTION
+    MemberFunction:
+      ReturnType: 116 # int
+      ClassType: 0x1007 # FooClass
+      ThisType: 0x1008 # const FooClass*
+      CallConv: ThisCall
+      Options: [ None ]
+      ParameterCount: 2
+      ArgumentList: 0x1004 # (int, double)
+      ThisPointerAdjustment: 0
+  # <method overload list>
+  # int (FooClass::)(int, char **)
+  # int (FooClass::)(int, double) [Index: 0x100B]
+  - Kind: LF_METHODLIST
+    MethodOverloadList:
+      Methods:
+        - Type: 0x1009 # int (FooClass::)(int, char **)
+          Attrs: 3 # public
+          VFTableOffset: -1
+          Name: ''
+        - Type: 0x100A # int (FooClass::)(int, double)
+          Attrs: 3 # public
+          VFTableOffset: -1
+          Name: ''
+  # <Field List>
+  # A, B, C [Index: 0x100C]
+  - Kind: LF_FIELDLIST
+    FieldList:
+      - Kind: LF_ENUMERATE
+        Enumerator:
+          Attrs: 3
+          Value: 0
+          Name: A
+      - Kind: LF_ENUMERATE
+        Enumerator:
+          Attrs: 3
+          Value: 1
+          Name: B
+      - Kind: LF_ENUMERATE
+        Enumerator:
+          Attrs: 3
+          Value: 2
+          Name: C
+  # enum FooClass::Enum : uint32_t {
+  #   A, B, C
+  # }; [Index: 0x100D]
+  - Kind: LF_ENUM
+    Enum:
+      NumEnumerators: 3
+      Options: [ None, Nested ]
+      FieldList: 0x100C
+      Name: 'FooClass::Enum'
+      UnderlyingType: 117
+  # <Field List>
+  # public:
+  #   enum FooEnum : uint32_t {
+  #     A, B, C
+  #   };
+  #   FooEnum EnumMember;
+  #   static int StaticInt;
+  #   int FooClass::OverloadedMethod(int, char **);
+  #   int FooClass::OverloadedMethod(int, double);
+  #   int FooClass::RegularMethod(int, double);
+  # [Index: 0x100E]
+  - Kind: LF_FIELDLIST
+    FieldList:
+      # enum FooEnum : uint32_t {
+      #   A, B, C
+      # };
+      - Kind: LF_NESTTYPE
+        NestedType:
+          Type: 0x100D
+          Name: FooEnum
+      # FooEnum EnumMember;
+      - Kind: LF_MEMBER
+        DataMember:
+          Attrs: 3 # public
+          Type: 0x100D # FooClass::Enum
+          FieldOffset: 0
+          Name: EnumMember
+      # static int StaticInt;
+      - Kind: LF_STMEMBER
+        StaticDataMember:
+          Attrs: 3 # public
+          Type: 116 # int
+          Name: StaticInt
+      # int FooClass::OverloadedMethod(int, char **);
+      # int FooClass::OverloadedMethod(int, double);
+      - Kind: LF_METHOD
+        OverloadedMethod:
+          NumOverloads: 2
+          MethodList: 0x100B
+          Name: OverloadedMethod
+      # int FooClass::RegularMethod(int, double);
+      - Kind: LF_ONEMETHOD
+        OneMethod:
+          Type: 0x100A
+          Attrs: 3 # public
+          VFTableOffset: -1
+          Name: RegularMethod
+  # class FooClass {
+  # public:
+  #   enum FooEnum : uint32_t {
+  #     A, B, C
+  #   };
+  #   FooEnum EnumMember;
+  #   static int StaticInt;
+  #   int FooClass::OverloadedMethod(int, char **);
+  #   int FooClass::OverloadedMethod(int, double);
+  #   int FooClass::RegularMethod(int, double);
+  # }; [Index: 0x100F]
+  - Kind: LF_CLASS
+    Class:
+      MemberCount: 6
+      Options: [ None ]
+      FieldList: 0x100E
+      Name: 'FooClass'
+      DerivationList: 0
+      VTableShape: 0
+      Size: 4
+  # struct FooStructure; [Index: 0x1010]
+  - Kind: LF_STRUCTURE
+    Class:
+      MemberCount: 6
+      Options: [ None ]
+      FieldList: 0x100E
+      Name: 'FooStructure'
+      DerivationList: 0
+      VTableShape: 0
+      Size: 4
+  # interface FooInterface; [Index: 0x1011]
+  - Kind: LF_INTERFACE
+    Class:
+      MemberCount: 6
+      Options: [ None ]
+      FieldList: 0x100E
+      Name: 'FooInterface'
+      DerivationList: 0
+      VTableShape: 0
+      Size: 4
+  # <field list>
+  # : public FooClass [Index: 0x1012]
+  - Kind: LF_FIELDLIST
+    FieldList:
+      - Kind: LF_BCLASS
+        Attrs: 3 # public
+        Type: 0x100F # FooClass
+        Offset: 0
+  # <field list>
+  # : public virtual FooClass [Index: 0x1013]
+  - Kind: LF_FIELDLIST
+    FieldList:
+      - Kind: LF_VBCLASS
+        Attrs: 3 # public
+        BaseType: 0x100F # FooClass
+        VBPtrType: 0x1001 # const int *
+        VBPtrOffset: 0
+        VTableIndex: 1
+  # class Inherit : public FooClass {}; [Index: 0x1014]
+  - Kind: LF_STRUCTURE
+    Class:
+      MemberCount: 1
+      Options: [ None ]
+      FieldList: 0x100E
+      Name: 'Inherit'
+      DerivationList: 0x1012
+      VTableShape: 0
+      Size: 4
+  # class VInherit : public virtual FooClass {}; [Index: 0x1015]
+  - Kind: LF_STRUCTURE
+    Class:
+      MemberCount: 1
+      Options: [ None ]
+      FieldList: 0x100E
+      Name: 'VInherit'
+      DerivationList: 0x1012
+      VTableShape: 0
+      Size: 4
+
+# // Member type records. These are generally not length prefixed, and appear
+# // inside of a field list record.
+# MEMBER_RECORD(LF_VFUNCTAB, 0x1409, VFPtr)
+
+# MEMBER_RECORD_ALIAS(LF_BINTERFACE, 0x151a, BaseInterface, BaseClass)
+
+# MEMBER_RECORD_ALIAS(LF_IVBCLASS, 0x1402, IndirectVirtualBaseClass,
+#                     VirtualBaseClass)
+
+
+# TYPE_RECORD(LF_ARRAY, 0x1503, Array)
+# TYPE_RECORD(LF_UNION, 0x1506, Union)
+# TYPE_RECORD(LF_TYPESERVER2, 0x1515, TypeServer2)
+# TYPE_RECORD(LF_VFTABLE, 0x151d, VFTable)
+# TYPE_RECORD(LF_VTSHAPE, 0x000a, VFTableShape)
+
+# TYPE_RECORD(LF_BITFIELD, 0x1205, BitField)
+
+
+# // ID leaf records. Subsequent leaf types may be referenced from .debug$S.
+# TYPE_RECORD(LF_FUNC_ID, 0x1601, FuncId)
+# TYPE_RECORD(LF_MFUNC_ID, 0x1602, MemberFuncId)
+# TYPE_RECORD(LF_BUILDINFO, 0x1603, BuildInfo)
+# TYPE_RECORD(LF_SUBSTR_LIST, 0x1604, StringList)
+# TYPE_RECORD(LF_STRING_ID, 0x1605, StringId)
+# TYPE_RECORD(LF_UDT_SRC_LINE, 0x1606, UdtSourceLine)
+# TYPE_RECORD(LF_UDT_MOD_SRC_LINE, 0x1607, UdtModSourceLine)
diff --git a/test/DebugInfo/PDB/every-type.test b/test/DebugInfo/PDB/every-type.test
new file mode 100644
index 0000000000000..e6b9c15815d03
--- /dev/null
+++ b/test/DebugInfo/PDB/every-type.test
@@ -0,0 +1,261 @@
+The test input (every-type.pdb) is generated from some short and trivial C++ code
+that exercises the entire type system to generate every possible type record that
+we claim to understand. We then test this in two ways:
+  1) We just dump the output for the purposes of readability. This tests that
+     we can dump every possible type record.
+  2) We dump the output to yaml, and then re-generate a PDB with the same type
+     stream, and then run test 1 on the new PDB. This verifies that the type
+     stream survives the YAML round trip unchanged.
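+
+For reference, here is a minimal sketch of the kind of C++ source that could
+produce this type stream. It is reconstructed from the dump below, not taken
+from the actual generator of every-type.pdb, so the names, bodies, and layout
+are assumptions:
+
+  class FooClass {                              // LF_CLASS + LF_FIELDLIST
+    enum NestedEnum { A, B, C };                // LF_ENUM + LF_ENUMERATE
+    void RegularMethod() {}                     // LF_ONEMETHOD + LF_MFUNCTION
+    void OverloadedMethod(int) {}               // LF_METHOD + LF_METHODLIST
+    void OverloadedMethod(int, int) {}
+    int HiNibble : 4;                           // LF_BITFIELD
+    int LoNibble : 4;
+    NestedEnum EnumVariable;                    // LF_MEMBER + LF_NESTTYPE
+    static void *StaticMember;                  // LF_STMEMBER
+  };
+
+  class Inherit : public FooClass {             // LF_BCLASS
+  public:
+    virtual ~Inherit() {}                       // LF_VTSHAPE + LF_VFUNCTAB
+  };
+
+  class VInherit : public virtual FooClass {};  // LF_VBCLASS
+  class IVInherit : public VInherit {};         // LF_IVBCLASS (FooClass is an
+                                                // indirect virtual base)
+  union TheUnion { int X; };                    // LF_UNION
+
+  void Func() {}                                // LF_PROCEDURE
+  void (*const FuncPtrArray[1])() = { &Func };  // LF_POINTER + LF_ARRAY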
+ + +RUN: llvm-pdbutil dump -type-index=0x1018,0x102A,0x103B,0x1093,0x1095,0x1096,0x1098 \ +RUN: -dependents %p/Inputs/every-type.pdb | FileCheck --check-prefix=TYPES %s + +RUN: llvm-pdbutil pdb2yaml -tpi-stream -ipi-stream %p/Inputs/every-type.pdb > %t.pdb.yaml +RUN: llvm-pdbutil yaml2pdb -pdb=%t.yaml.pdb %t.pdb.yaml +RUN: llvm-pdbutil dump -type-index=0x1018,0x102A,0x103B,0x1093,0x1095,0x1096,0x1098 \ +RUN: -dependents %t.yaml.pdb | FileCheck --check-prefix=TYPES %s + +TYPES: Types (TPI Stream) +TYPES-NEXT: ============================================================ +TYPES-NEXT: Showing 7 records and their dependents (73 records total) +TYPES-NEXT: 0x1005 | LF_MODIFIER [size = 12] +TYPES-NEXT: referent = 0x0074 (int), modifiers = const +TYPES-NEXT: 0x1006 | LF_CLASS [size = 48] `FooClass` +TYPES-NEXT: unique name: `.?AVFooClass@@` +TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> +TYPES-NEXT: options: forward ref | has unique name +TYPES-NEXT: 0x1007 | LF_VTSHAPE [size = 8] +TYPES-NEXT: 0x1008 | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x1007, mode = pointer, opts = None, kind = ptr32 +TYPES-NEXT: 0x1009 | LF_CLASS [size = 44] `Inherit` +TYPES-NEXT: unique name: `.?AVInherit@@` +TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> +TYPES-NEXT: options: forward ref | has unique name +TYPES-NEXT: 0x100A | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x1009, mode = pointer, opts = const, kind = ptr32 +TYPES-NEXT: 0x100B | LF_ARGLIST [size = 8] +TYPES-NEXT: 0x100C | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B +TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x100D | LF_MODIFIER [size = 12] +TYPES-NEXT: referent = 0x1009, modifiers = const +TYPES-NEXT: 0x100E | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x100D, mode = ref, opts = None, kind = ptr32 +TYPES-NEXT: 0x100F | LF_ARGLIST [size = 12] +TYPES-NEXT: 0x100E: `const Inherit&` +TYPES-NEXT: 0x1010 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x100F +TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = constructor +TYPES-NEXT: 0x1011 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B +TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = constructor +TYPES-NEXT: 0x1012 | LF_METHODLIST [size = 20] +TYPES-NEXT: - Method [type = 0x1010, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: - Method [type = 0x1011, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: 0x1013 | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x1009, mode = ref, opts = None, kind = ptr32 +TYPES-NEXT: 0x1014 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x1013, # args = 1, param list = 0x100F +TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x1015 | LF_ARGLIST [size = 12] +TYPES-NEXT: 0x0075 (unsigned): `unsigned` +TYPES-NEXT: 0x1016 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0403 (void*), # args = 1, param list = 0x1015 +TYPES-NEXT: class type = 0x1009, this type = 0x100A, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x1017 | LF_FIELDLIST [size = 152] +TYPES-NEXT: - 
LF_BCLASS +TYPES-NEXT: type = 0x1006, offset = 4, attrs = public +TYPES-NEXT: - LF_VFUNCTAB type = 0x1008 +TYPES-NEXT: - LF_ONEMETHOD [name = `~Inherit`] +TYPES-NEXT: type = 0x100C, vftable offset = 0, attrs = public intro virtual +TYPES-NEXT: - LF_METHOD [name = `Inherit`, # overloads = 2, overload list = 0x1012] +TYPES-NEXT: - LF_ONEMETHOD [name = `operator=`] +TYPES-NEXT: type = 0x1014, vftable offset = -1, attrs = public compiler-generated +TYPES-NEXT: - LF_ONEMETHOD [name = `__local_vftable_ctor_closure`] +TYPES-NEXT: type = 0x100C, vftable offset = -1, attrs = public compiler-generated +TYPES-NEXT: - LF_ONEMETHOD [name = `__vecDelDtor`] +TYPES-NEXT: type = 0x1016, vftable offset = 0, attrs = public intro virtual compiler-generated +TYPES-NEXT: 0x1018 | LF_CLASS [size = 44] `Inherit` +TYPES-NEXT: unique name: `.?AVInherit@@` +TYPES-NEXT: vtable: 0x1007, base list: <no type>, field list: 0x1017 +TYPES-NEXT: options: has ctor / dtor | has unique name | overloaded operator | overloaded operator= +TYPES-NEXT: 0x1019 | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x1005, mode = pointer, opts = None, kind = ptr32 +TYPES-NEXT: 0x101A | LF_CLASS [size = 48] `VInherit` +TYPES-NEXT: unique name: `.?AVVInherit@@` +TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> +TYPES-NEXT: options: forward ref | has unique name +TYPES-NEXT: 0x101B | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x101A, mode = pointer, opts = const, kind = ptr32 +TYPES-NEXT: 0x101C | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x101A, mode = rvalue ref, opts = None, kind = ptr32 +TYPES-NEXT: 0x101D | LF_ARGLIST [size = 12] +TYPES-NEXT: 0x101C: `VInherit&&` +TYPES-NEXT: 0x101E | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x101D +TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor +TYPES-NEXT: 0x101F | LF_MODIFIER [size = 12] +TYPES-NEXT: referent = 0x101A, modifiers = const +TYPES-NEXT: 0x1020 | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x101F, mode = ref, opts = None, kind = ptr32 +TYPES-NEXT: 0x1021 | LF_ARGLIST [size = 12] +TYPES-NEXT: 0x1020: `const VInherit&` +TYPES-NEXT: 0x1022 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1021 +TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor +TYPES-NEXT: 0x1023 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B +TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor +TYPES-NEXT: 0x1024 | LF_METHODLIST [size = 28] +TYPES-NEXT: - Method [type = 0x101E, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: - Method [type = 0x1022, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: - Method [type = 0x1023, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: 0x1025 | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x101A, mode = ref, opts = None, kind = ptr32 +TYPES-NEXT: 0x1026 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x1025, # args = 1, param list = 0x101D +TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x1027 | LF_MFUNCTION [size = 28] 
+TYPES-NEXT: return type = 0x1025, # args = 1, param list = 0x1021 +TYPES-NEXT: class type = 0x101A, this type = 0x101B, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x1028 | LF_METHODLIST [size = 20] +TYPES-NEXT: - Method [type = 0x1026, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: - Method [type = 0x1027, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: 0x1029 | LF_FIELDLIST [size = 60] +TYPES-NEXT: - LF_VBCLASS +TYPES-NEXT: base = 0x1006, vbptr = 0x1019, vbptr offset = 0, vtable index = 1 +TYPES-NEXT: attrs = public +TYPES-NEXT: - LF_METHOD [name = `VInherit`, # overloads = 3, overload list = 0x1024] +TYPES-NEXT: - LF_METHOD [name = `operator=`, # overloads = 2, overload list = 0x1028] +TYPES-NEXT: 0x102A | LF_CLASS [size = 48] `VInherit` +TYPES-NEXT: unique name: `.?AVVInherit@@` +TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1029 +TYPES-NEXT: options: has ctor / dtor | has unique name | overloaded operator | overloaded operator= +TYPES-NEXT: 0x102B | LF_CLASS [size = 48] `IVInherit` +TYPES-NEXT: unique name: `.?AVIVInherit@@` +TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> +TYPES-NEXT: options: forward ref | has unique name +TYPES-NEXT: 0x102C | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x102B, mode = pointer, opts = const, kind = ptr32 +TYPES-NEXT: 0x102D | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x102B, mode = rvalue ref, opts = None, kind = ptr32 +TYPES-NEXT: 0x102E | LF_ARGLIST [size = 12] +TYPES-NEXT: 0x102D: `IVInherit&&` +TYPES-NEXT: 0x102F | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x102E +TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor +TYPES-NEXT: 0x1030 | LF_MODIFIER [size = 12] +TYPES-NEXT: referent = 0x102B, modifiers = const +TYPES-NEXT: 0x1031 | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x1030, mode = ref, opts = None, kind = ptr32 +TYPES-NEXT: 0x1032 | LF_ARGLIST [size = 12] +TYPES-NEXT: 0x1031: `const IVInherit&` +TYPES-NEXT: 0x1033 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x1032 +TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor +TYPES-NEXT: 0x1034 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B +TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = constructor with virtual bases | constructor +TYPES-NEXT: 0x1035 | LF_METHODLIST [size = 28] +TYPES-NEXT: - Method [type = 0x102F, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: - Method [type = 0x1033, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: - Method [type = 0x1034, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: 0x1036 | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x102B, mode = ref, opts = None, kind = ptr32 +TYPES-NEXT: 0x1037 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x1036, # args = 1, param list = 0x102E +TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x1038 | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x1036, # args = 1, param list = 0x1032 
+TYPES-NEXT: class type = 0x102B, this type = 0x102C, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x1039 | LF_METHODLIST [size = 20] +TYPES-NEXT: - Method [type = 0x1037, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: - Method [type = 0x1038, vftable offset = -1, attrs = public compiler-generated] +TYPES-NEXT: 0x103A | LF_FIELDLIST [size = 72] +TYPES-NEXT: - LF_BCLASS +TYPES-NEXT: type = 0x101A, offset = 0, attrs = public +TYPES-NEXT: - LF_IVBCLASS +TYPES-NEXT: base = 0x1006, vbptr = 0x1019, vbptr offset = 0, vtable index = 1 +TYPES-NEXT: attrs = public +TYPES-NEXT: - LF_METHOD [name = `IVInherit`, # overloads = 3, overload list = 0x1035] +TYPES-NEXT: - LF_METHOD [name = `operator=`, # overloads = 2, overload list = 0x1039] +TYPES-NEXT: 0x103B | LF_CLASS [size = 48] `IVInherit` +TYPES-NEXT: unique name: `.?AVIVInherit@@` +TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: 0x103A +TYPES-NEXT: options: has ctor / dtor | has unique name | overloaded operator | overloaded operator= +TYPES-NEXT: 0x1087 | LF_FIELDLIST [size = 28] +TYPES-NEXT: - LF_ENUMERATE [A = 0] +TYPES-NEXT: - LF_ENUMERATE [B = 1] +TYPES-NEXT: - LF_ENUMERATE [C = 2] +TYPES-NEXT: 0x1088 | LF_ENUM [size = 64] `FooClass::NestedEnum` +TYPES-NEXT: unique name: `.?AW4NestedEnum@FooClass@@` +TYPES-NEXT: field list: 0x1087, underlying type: 0x0074 (int) +TYPES-NEXT: options: has unique name | is nested +TYPES-NEXT: 0x1089 | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x1006, mode = pointer, opts = const, kind = ptr32 +TYPES-NEXT: 0x108A | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B +TYPES-NEXT: class type = 0x1006, this type = 0x1089, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x108B | LF_ARGLIST [size = 16] +TYPES-NEXT: 0x0074 (int): `int` +TYPES-NEXT: 0x0074 (int): `int` +TYPES-NEXT: 0x108C | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 2, param list = 0x108B +TYPES-NEXT: class type = 0x1006, this type = 0x1089, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x108D | LF_ARGLIST [size = 12] +TYPES-NEXT: 0x0074 (int): `int` +TYPES-NEXT: 0x108E | LF_MFUNCTION [size = 28] +TYPES-NEXT: return type = 0x0003 (void), # args = 1, param list = 0x108D +TYPES-NEXT: class type = 0x1006, this type = 0x1089, this adjust = 0 +TYPES-NEXT: calling conv = thiscall, options = None +TYPES-NEXT: 0x108F | LF_METHODLIST [size = 20] +TYPES-NEXT: - Method [type = 0x108C, vftable offset = -1, attrs = private] +TYPES-NEXT: - Method [type = 0x108E, vftable offset = -1, attrs = private] +TYPES-NEXT: 0x1090 | LF_BITFIELD [size = 12] +TYPES-NEXT: type = 0x0074 (int), bit offset = 0, # bits = 4 +TYPES-NEXT: 0x1091 | LF_BITFIELD [size = 12] +TYPES-NEXT: type = 0x0074 (int), bit offset = 4, # bits = 4 +TYPES-NEXT: 0x1092 | LF_FIELDLIST [size = 164] +TYPES-NEXT: - LF_NESTTYPE [name = `NestedEnum`, parent = 0x1088] +TYPES-NEXT: - LF_ONEMETHOD [name = `RegularMethod`] +TYPES-NEXT: type = 0x108A, vftable offset = -1, attrs = private +TYPES-NEXT: - LF_METHOD [name = `OverloadedMethod`, # overloads = 2, overload list = 0x108F] +TYPES-NEXT: - LF_MEMBER [name = `HiNibble`, Type = 0x1090, offset = 0, attrs = private] +TYPES-NEXT: - LF_MEMBER [name = `LoNibble`, Type = 0x1091, offset = 0, attrs = private] +TYPES-NEXT: - LF_MEMBER [name = `EnumVariable`, Type = 0x1088, offset = 4, attrs = private] +TYPES-NEXT: - LF_STMEMBER [name = 
`StaticMember`, type = 0x0403 (void*), attrs = private] +TYPES-NEXT: 0x1093 | LF_CLASS [size = 48] `FooClass` +TYPES-NEXT: unique name: `.?AVFooClass@@` +TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1092 +TYPES-NEXT: options: contains nested class | has unique name +TYPES-NEXT: 0x1094 | LF_FIELDLIST [size = 16] +TYPES-NEXT: - LF_MEMBER [name = `X`, Type = 0x0074 (int), offset = 0, attrs = public] +TYPES-NEXT: 0x1095 | LF_UNION [size = 40] `TheUnion` +TYPES-NEXT: unique name: `.?ATTheUnion@@` +TYPES-NEXT: field list: 0x1094 +TYPES-NEXT: options: has unique name | sealed +TYPES-NEXT: 0x1096 | LF_PROCEDURE [size = 16] +TYPES-NEXT: return type = 0x0003 (void), # args = 0, param list = 0x100B +TYPES-NEXT: calling conv = cdecl, options = None +TYPES-NEXT: 0x1097 | LF_POINTER [size = 12] +TYPES-NEXT: referent = 0x1096, mode = pointer, opts = const, kind = ptr32 +TYPES-NEXT: 0x1098 | LF_ARRAY [size = 16] +TYPES-NEXT: size: 4, index type: 0x0022 (unsigned long), element type: 0x1097 diff --git a/test/DebugInfo/PDB/pdbdump-headers.test b/test/DebugInfo/PDB/pdbdump-headers.test index 3b7895e06b77d..1887af2e82683 100644 --- a/test/DebugInfo/PDB/pdbdump-headers.test +++ b/test/DebugInfo/PDB/pdbdump-headers.test @@ -67,9 +67,11 @@ ALL-NEXT: ============================================================ ALL-NEXT: Mod 0000 | Name: `d:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj`: ALL-NEXT: Obj: `d:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj`: ALL-NEXT: debug stream: 12, # files: 1, has ec info: false +ALL-NEXT: pdb file ni: 0 ``, src file ni: 0 `` ALL-NEXT: Mod 0001 | Name: `* Linker *`: ALL-NEXT: Obj: ``: ALL-NEXT: debug stream: 14, # files: 0, has ec info: false +ALL-NEXT: pdb file ni: 1 `{{.*empty.pdb}}`, src file ni: 0 `` ALL: Files ALL-NEXT: ============================================================ ALL-NEXT: Mod 0000 | `d:\src\llvm\test\DebugInfo\PDB\Inputs\empty.obj`: @@ -99,13 +101,11 @@ ALL-NEXT: - LF_ENUMERATE [single = 2] ALL-NEXT: - LF_ENUMERATE [free = 3] ALL-NEXT: - LF_ENUMERATE [neutral = 4] ALL-NEXT: - LF_ENUMERATE [both = 5] -ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 208239] -ALL-NEXT: name: `__vc_attributes::threadingAttribute::threading_e` +ALL-NEXT: 0x1003 | LF_ENUM [size = 120, hash = 208239] `__vc_attributes::threadingAttribute::threading_e` ALL-NEXT: unique name: `.?AW4threading_e@threadingAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1002, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 16377] -ALL-NEXT: class name: `__vc_attributes::threadingAttribute` +ALL-NEXT: 0x1004 | LF_STRUCTURE [size = 100, hash = 16377] `__vc_attributes::threadingAttribute` ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> ALL-NEXT: options: forward ref | has unique name @@ -128,8 +128,7 @@ ALL-NEXT: 0x100A | LF_FIELDLIST [size = 68, hash = 185421] ALL-NEXT: - LF_NESTTYPE [name = `threading_e`, parent = 0x1003] ALL-NEXT: - LF_METHOD [name = `threadingAttribute`, # overloads = 2, overload list = 0x1009] ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x1003, offset = 0, attrs = public] -ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 119540] -ALL-NEXT: class name: `__vc_attributes::threadingAttribute` +ALL-NEXT: 0x100B | LF_STRUCTURE [size = 100, hash = 119540] `__vc_attributes::threadingAttribute` ALL-NEXT: unique name: `.?AUthreadingAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base 
list: <no type>, field list: 0x100A ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name @@ -137,13 +136,11 @@ ALL-NEXT: 0x100C | LF_FIELDLIST [size = 48, hash = 261871] ALL-NEXT: - LF_ENUMERATE [native = 0] ALL-NEXT: - LF_ENUMERATE [com = 1] ALL-NEXT: - LF_ENUMERATE [managed = 2] -ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 198119] -ALL-NEXT: name: `__vc_attributes::event_receiverAttribute::type_e` +ALL-NEXT: 0x100D | LF_ENUM [size = 120, hash = 198119] `__vc_attributes::event_receiverAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@event_receiverAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 48056] -ALL-NEXT: class name: `__vc_attributes::event_receiverAttribute` +ALL-NEXT: 0x100E | LF_STRUCTURE [size = 112, hash = 48056] `__vc_attributes::event_receiverAttribute` ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> ALL-NEXT: options: forward ref | has unique name @@ -175,8 +172,7 @@ ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x100D] ALL-NEXT: - LF_METHOD [name = `event_receiverAttribute`, # overloads = 3, overload list = 0x1015] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x100D, offset = 0, attrs = public] ALL-NEXT: - LF_MEMBER [name = `layout_dependent`, Type = 0x0030 (bool), offset = 4, attrs = public] -ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 148734] -ALL-NEXT: class name: `__vc_attributes::event_receiverAttribute` +ALL-NEXT: 0x1017 | LF_STRUCTURE [size = 112, hash = 148734] `__vc_attributes::event_receiverAttribute` ALL-NEXT: unique name: `.?AUevent_receiverAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1016 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name @@ -184,13 +180,11 @@ ALL-NEXT: 0x1018 | LF_FIELDLIST [size = 48, hash = 81128] ALL-NEXT: - LF_ENUMERATE [never = 0] ALL-NEXT: - LF_ENUMERATE [allowed = 1] ALL-NEXT: - LF_ENUMERATE [always = 2] -ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 60158] -ALL-NEXT: name: `__vc_attributes::aggregatableAttribute::type_e` +ALL-NEXT: 0x1019 | LF_ENUM [size = 116, hash = 60158] `__vc_attributes::aggregatableAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@aggregatableAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1018, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 217249] -ALL-NEXT: class name: `__vc_attributes::aggregatableAttribute` +ALL-NEXT: 0x101A | LF_STRUCTURE [size = 108, hash = 217249] `__vc_attributes::aggregatableAttribute` ALL-NEXT: unique name: `.?AUaggregatableAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> ALL-NEXT: options: forward ref | has unique name @@ -213,26 +207,22 @@ ALL-NEXT: 0x1020 | LF_FIELDLIST [size = 68, hash = 6214] ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1019] ALL-NEXT: - LF_METHOD [name = `aggregatableAttribute`, # overloads = 2, overload list = 0x101F] ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1019, offset = 0, attrs = public] -ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 94935] -ALL-NEXT: class name: `__vc_attributes::aggregatableAttribute` +ALL-NEXT: 0x1021 | LF_STRUCTURE [size = 108, hash = 94935] `__vc_attributes::aggregatableAttribute` ALL-NEXT: unique name: 
`.?AUaggregatableAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1020 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name -ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 151449] -ALL-NEXT: name: `__vc_attributes::event_sourceAttribute::type_e` +ALL-NEXT: 0x1022 | LF_ENUM [size = 116, hash = 151449] `__vc_attributes::event_sourceAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@event_sourceAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x100C, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested ALL-NEXT: 0x1023 | LF_FIELDLIST [size = 28, hash = 135589] ALL-NEXT: - LF_ENUMERATE [speed = 0] ALL-NEXT: - LF_ENUMERATE [size = 1] -ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 73373] -ALL-NEXT: name: `__vc_attributes::event_sourceAttribute::optimize_e` +ALL-NEXT: 0x1024 | LF_ENUM [size = 124, hash = 73373] `__vc_attributes::event_sourceAttribute::optimize_e` ALL-NEXT: unique name: `.?AW4optimize_e@event_sourceAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x1023, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 96512] -ALL-NEXT: class name: `__vc_attributes::event_sourceAttribute` +ALL-NEXT: 0x1025 | LF_STRUCTURE [size = 108, hash = 96512] `__vc_attributes::event_sourceAttribute` ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> ALL-NEXT: options: forward ref | has unique name @@ -258,8 +248,7 @@ ALL-NEXT: - LF_METHOD [name = `event_sourceAttribute`, # overloads = ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1022, offset = 0, attrs = public] ALL-NEXT: - LF_MEMBER [name = `optimize`, Type = 0x1024, offset = 4, attrs = public] ALL-NEXT: - LF_MEMBER [name = `decorate`, Type = 0x0030 (bool), offset = 8, attrs = public] -ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 238560] -ALL-NEXT: class name: `__vc_attributes::event_sourceAttribute` +ALL-NEXT: 0x102C | LF_STRUCTURE [size = 108, hash = 238560] `__vc_attributes::event_sourceAttribute` ALL-NEXT: unique name: `.?AUevent_sourceAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x102B ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name @@ -270,13 +259,11 @@ ALL-NEXT: - LF_ENUMERATE [service = 3] ALL-NEXT: - LF_ENUMERATE [unspecified = 4] ALL-NEXT: - LF_ENUMERATE [EXE = 2] ALL-NEXT: - LF_ENUMERATE [SERVICE = 3] -ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 115151] -ALL-NEXT: name: `__vc_attributes::moduleAttribute::type_e` +ALL-NEXT: 0x102E | LF_ENUM [size = 104, hash = 115151] `__vc_attributes::moduleAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@moduleAttribute@__vc_attributes@@` ALL-NEXT: field list: 0x102D, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 197306] -ALL-NEXT: class name: `__vc_attributes::moduleAttribute` +ALL-NEXT: 0x102F | LF_STRUCTURE [size = 96, hash = 197306] `__vc_attributes::moduleAttribute` ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> ALL-NEXT: options: forward ref | has unique name @@ -338,8 +325,7 @@ ALL-NEXT: - LF_MEMBER [name = `hidden`, Type = 0x0030 (bool), offset ALL-NEXT: - LF_MEMBER [name = `restricted`, Type = 0x0030 (bool), offset = 45, attrs = public] ALL-NEXT: - LF_MEMBER [name = 
`custom`, Type = 0x1032, offset = 48, attrs = public] ALL-NEXT: - LF_MEMBER [name = `resource_name`, Type = 0x1032, offset = 52, attrs = public] -ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 98548] -ALL-NEXT: class name: `__vc_attributes::moduleAttribute` +ALL-NEXT: 0x103A | LF_STRUCTURE [size = 96, hash = 98548] `__vc_attributes::moduleAttribute` ALL-NEXT: unique name: `.?AUmoduleAttribute@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1039 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name @@ -374,13 +360,11 @@ ALL-NEXT: - LF_ENUMERATE [eModuleUsage = 16777216] ALL-NEXT: - LF_ENUMERATE [eIllegalUsage = 33554432] ALL-NEXT: - LF_ENUMERATE [eAsynchronousUsage = 67108864] ALL-NEXT: - LF_ENUMERATE [eAnyIDLUsage = 4161535] -ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 171328] -ALL-NEXT: name: `__vc_attributes::helper_attributes::usageAttribute::usage_e` +ALL-NEXT: 0x103C | LF_ENUM [size = 140, hash = 171328] `__vc_attributes::helper_attributes::usageAttribute::usage_e` ALL-NEXT: unique name: `.?AW4usage_e@usageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: field list: 0x103B, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 203640] -ALL-NEXT: class name: `__vc_attributes::helper_attributes::usageAttribute` +ALL-NEXT: 0x103D | LF_STRUCTURE [size = 128, hash = 203640] `__vc_attributes::helper_attributes::usageAttribute` ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> ALL-NEXT: options: forward ref | has unique name @@ -397,8 +381,7 @@ ALL-NEXT: - LF_NESTTYPE [name = `usage_e`, parent = 0x103C] ALL-NEXT: - LF_ONEMETHOD [name = `usageAttribute`] ALL-NEXT: type = 0x1040, vftable offset = -1, attrs = public ALL-NEXT: - LF_MEMBER [name = `value`, Type = 0x0075 (unsigned), offset = 0, attrs = public] -ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 165040] -ALL-NEXT: class name: `__vc_attributes::helper_attributes::usageAttribute` +ALL-NEXT: 0x1042 | LF_STRUCTURE [size = 128, hash = 165040] `__vc_attributes::helper_attributes::usageAttribute` ALL-NEXT: unique name: `.?AUusageAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1041 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name @@ -407,13 +390,11 @@ ALL-NEXT: - LF_ENUMERATE [eBoolean = 0] ALL-NEXT: - LF_ENUMERATE [eInteger = 1] ALL-NEXT: - LF_ENUMERATE [eFloat = 2] ALL-NEXT: - LF_ENUMERATE [eDouble = 3] -ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 142625] -ALL-NEXT: name: `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` +ALL-NEXT: 0x1044 | LF_ENUM [size = 148, hash = 142625] `__vc_attributes::helper_attributes::v1_alttypeAttribute::type_e` ALL-NEXT: unique name: `.?AW4type_e@v1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: field list: 0x1043, underlying type: 0x0074 (int) ALL-NEXT: options: has unique name | is nested -ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 52534] -ALL-NEXT: class name: `__vc_attributes::helper_attributes::v1_alttypeAttribute` +ALL-NEXT: 0x1045 | LF_STRUCTURE [size = 140, hash = 52534] `__vc_attributes::helper_attributes::v1_alttypeAttribute` ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> ALL-NEXT: options: forward ref | 
has unique name @@ -430,8 +411,7 @@ ALL-NEXT: - LF_NESTTYPE [name = `type_e`, parent = 0x1044] ALL-NEXT: - LF_ONEMETHOD [name = `v1_alttypeAttribute`] ALL-NEXT: type = 0x1048, vftable offset = -1, attrs = public ALL-NEXT: - LF_MEMBER [name = `type`, Type = 0x1044, offset = 0, attrs = public] -ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 213215] -ALL-NEXT: class name: `__vc_attributes::helper_attributes::v1_alttypeAttribute` +ALL-NEXT: 0x104A | LF_STRUCTURE [size = 140, hash = 213215] `__vc_attributes::helper_attributes::v1_alttypeAttribute` ALL-NEXT: unique name: `.?AUv1_alttypeAttribute@helper_attributes@__vc_attributes@@` ALL-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1049 ALL-NEXT: options: has ctor / dtor | contains nested class | has unique name @@ -590,147 +570,195 @@ BIG-NEXT: ============================================================ BIG-NEXT: Mod 0000 | Name: `D:\src\llvm\test\tools\llvm-symbolizer\pdb\Inputs\test.obj`: BIG-NEXT: Obj: `D:\src\llvm\test\tools\llvm-symbolizer\pdb\Inputs\test.obj`: BIG-NEXT: debug stream: 12, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0001 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_cpu_disp_.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 14, # files: 14, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0002 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_initsect_.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 15, # files: 19, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0003 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_sehprolg4_.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 16, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 1 `f:\dd\vctools\crt\vcstartup\src\eh\i386\sehprolg4.asm` BIG-NEXT: Mod 0004 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_chandler4gs_.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 17, # files: 14, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0005 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_secchk_.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 18, # files: 14, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0006 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\gs_cookie.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 19, # files: 9, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0007 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\gs_report.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 20, # files: 14, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0008 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\gs_support.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 21, # files: 10, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0009 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\checkcfg.obj`: BIG-NEXT: Obj: 
`C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 22, # files: 14, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0010 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\guard_support.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 23, # files: 10, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0011 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\loadcfg.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 24, # files: 9, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0012 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\dyn_tls_dtor.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 25, # files: 11, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0013 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\dyn_tls_init.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 26, # files: 10, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0014 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\matherr_detection.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 27, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0015 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\ucrt_detection.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 28, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0016 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\argv_mode.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 29, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0017 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\commit_mode.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 30, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0018 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\default_local_stdio_options.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 31, # files: 24, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0019 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\denormal_control.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 32, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0020 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\env_mode.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 33, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0021 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\file_mode.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 34, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0022 | Name: 
`f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\invalid_parameter_handler.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 35, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0023 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\matherr.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 36, # files: 2, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0024 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\new_mode.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 37, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0025 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\thread_locale.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 38, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0026 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\tncleanup.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 39, # files: 21, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0027 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\exe_main.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 40, # files: 26, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0028 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\initializers.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 41, # files: 20, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0029 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\utility.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 42, # files: 20, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0030 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\ucrt_stubs.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 43, # files: 1, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0031 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\utility_desktop.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 44, # files: 20, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0032 | Name: `f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\default_precision.obj`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\MSVCRT.lib`: BIG-NEXT: debug stream: 45, # files: 20, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0033 | Name: `Import:KERNEL32.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\um\x86\kernel32.lib`: BIG-NEXT: debug stream: 47, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0034 | Name: `KERNEL32.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\um\x86\kernel32.lib`: BIG-NEXT: debug stream: 46, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0035 | Name: `Import:VCRUNTIME140.dll`: BIG-NEXT: 
Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\vcruntime.lib`: BIG-NEXT: debug stream: 49, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0036 | Name: `VCRUNTIME140.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\MI0E91~1.0\VC\LIB\vcruntime.lib`: BIG-NEXT: debug stream: 48, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0037 | Name: `Import:api-ms-win-crt-stdio-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 59, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0038 | Name: `api-ms-win-crt-stdio-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 58, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0039 | Name: `Import:api-ms-win-crt-runtime-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 57, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0040 | Name: `api-ms-win-crt-runtime-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 56, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0041 | Name: `Import:api-ms-win-crt-math-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 55, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0042 | Name: `api-ms-win-crt-math-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 54, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0043 | Name: `Import:api-ms-win-crt-locale-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 53, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0044 | Name: `api-ms-win-crt-locale-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 52, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0045 | Name: `Import:api-ms-win-crt-heap-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 51, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0046 | Name: `api-ms-win-crt-heap-l1-1-0.dll`: BIG-NEXT: Obj: `C:\PROGRA~2\WI3CF2~1\10\Lib\10.0.10586.0\ucrt\x86\ucrt.lib`: BIG-NEXT: debug stream: 50, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 0 ``, src file ni: 0 `` BIG-NEXT: Mod 0047 | Name: `* Linker *`: BIG-NEXT: Obj: ``: BIG-NEXT: debug stream: 60, # files: 0, has ec info: false +BIG-NEXT: pdb file ni: 55 `{{.*test.pdb}}`, src file ni: 0 `` BIG: Files BIG-NEXT: ============================================================ BIG-NEXT: Mod 0000 | `D:\src\llvm\test\tools\llvm-symbolizer\pdb\Inputs\test.obj`: diff --git a/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test b/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test index 3903c07b027fb..dd4c072fe0c94 100644 --- a/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test +++ b/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test @@ -14,8 +14,7 @@ TPI-TYPES-NEXT: - LF_MEMBER [name = 
`FooMember`, Type = 0x0403 (void* TPI-TYPES-NEXT: 0x1002 | LF_ARGLIST [size = 16] TPI-TYPES-NEXT: 0x0074 (int): `int` TPI-TYPES-NEXT: 0x1000: `char**` -TPI-TYPES-NEXT: 0x1003 | LF_STRUCTURE [size = 36] -TPI-TYPES-NEXT: class name: `FooBar` +TPI-TYPES-NEXT: 0x1003 | LF_STRUCTURE [size = 36] `FooBar` TPI-TYPES-NEXT: unique name: `FooBar` TPI-TYPES-NEXT: vtable: <no type>, base list: <no type>, field list: 0x1001 TPI-TYPES-NEXT: options: has unique name diff --git a/test/DebugInfo/PDB/pdbdump-mergetypes.test b/test/DebugInfo/PDB/pdbdump-mergetypes.test index 8ab64cfab5163..60cf4a172aa2a 100644 --- a/test/DebugInfo/PDB/pdbdump-mergetypes.test +++ b/test/DebugInfo/PDB/pdbdump-mergetypes.test @@ -11,8 +11,7 @@ MERGED-NEXT: 0x1000 | LF_POINTER [size = 12] MERGED-NEXT: referent = 0x0075 (unsigned), mode = pointer, opts = None, kind = ptr32 MERGED-NEXT: 0x1001 | LF_POINTER [size = 12] MERGED-NEXT: referent = 0x0076 (__int64), mode = pointer, opts = None, kind = ptr32 -MERGED-NEXT: 0x1002 | LF_STRUCTURE [size = 48] -MERGED-NEXT: class name: `OnlyInMerge1` +MERGED-NEXT: 0x1002 | LF_STRUCTURE [size = 48] `OnlyInMerge1` MERGED-NEXT: unique name: `OnlyInMerge1` MERGED-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> MERGED-NEXT: options: forward ref | has unique name @@ -29,8 +28,7 @@ MERGED-NEXT: 0x1003: `unsigned**` MERGED-NEXT: 0x1007 | LF_PROCEDURE [size = 16] MERGED-NEXT: return type = 0x0075 (unsigned), # args = 0, param list = 0x1006 MERGED-NEXT: calling conv = cdecl, options = None -MERGED-NEXT: 0x1008 | LF_STRUCTURE [size = 48] -MERGED-NEXT: class name: `OnlyInMerge2` +MERGED-NEXT: 0x1008 | LF_STRUCTURE [size = 48] `OnlyInMerge2` MERGED-NEXT: unique name: `OnlyInMerge2` MERGED-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> MERGED-NEXT: options: forward ref | has unique name diff --git a/test/DebugInfo/X86/dbg-declare-inalloca.ll b/test/DebugInfo/X86/dbg-declare-inalloca.ll index e3f5c7e629b87..e8a310856c104 100644 --- a/test/DebugInfo/X86/dbg-declare-inalloca.ll +++ b/test/DebugInfo/X86/dbg-declare-inalloca.ll @@ -55,41 +55,41 @@ ; CHECK: .asciz "c" ; CHECK: .cv_def_range [[start]] [[end]] -; OBJ-LABEL: ProcStart { +; OBJ-LABEL: {{.*}}Proc{{.*}}Sym { ; OBJ: Kind: S_GPROC32_ID (0x1147) ; OBJ: DisplayName: f ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: NonTrivial (0x1007) ; OBJ: Flags [ (0x1) ; OBJ: IsParameter (0x1) ; OBJ: ] ; OBJ: VarName: a ; OBJ: } -; OBJ: DefRangeRegisterRel { +; OBJ: DefRangeRegisterRelSym { ; OBJ: BaseRegister: 21 ; OBJ: BasePointerOffset: 12 ; OBJ: } -; OBJ: Local { +; OBJ: LocalSym { ; OBJ: Type: int (0x74) ; OBJ: Flags [ (0x1) ; OBJ: IsParameter (0x1) ; OBJ: ] ; OBJ: VarName: b ; OBJ: } -; OBJ: DefRangeRegisterRel { +; OBJ: DefRangeRegisterRelSym { ; OBJ: BaseRegister: 21 ; OBJ: BasePointerOffset: 16 ; OBJ: } ; FIXME: Retain unused. 
-; OBJ: Local {
+; OBJ: LocalSym {
; OBJ: Type: int (0x74)
; OBJ: Flags [ (0x1)
; OBJ: IsParameter (0x1)
; OBJ: ]
; OBJ: VarName: c
; OBJ: }
-; OBJ: DefRangeRegisterRel {
+; OBJ: DefRangeRegisterRelSym {
; OBJ: BaseRegister: 21
; OBJ: BasePointerOffset: 24
; OBJ: }
diff --git a/test/DebugInfo/dwarfdump-str-offsets.test b/test/DebugInfo/dwarfdump-str-offsets.test
index 0465357ba32a6..c09135580fe62 100644
--- a/test/DebugInfo/dwarfdump-str-offsets.test
+++ b/test/DebugInfo/dwarfdump-str-offsets.test
@@ -1,92 +1,94 @@
-RUN: llvm-dwarfdump %p/Inputs/dwarfdump-str-offsets.x86_64.o | FileCheck %s
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-str-offsets.x86_64.o | FileCheck --check-prefix=COMMON \
+RUN: --check-prefix=SPLIT %s
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-str-offsets-macho.o | FileCheck --check-prefix=COMMON %s
; We are using a hand-constructed object file and are interested in the correct
; display of the DW_AT_str_offsets_base attribute, the correct display of strings
; and the dump of the .debug_str_offsets[.dwo] table.
;
; Abbreviation for DW_AT_str_offsets_base
-CHECK: .debug_abbrev contents:
-CHECK-NOT: contents:
-CHECK: DW_TAG_compile_unit
-CHECK-NOT: DW_TAG
-CHECK: DW_AT_str_offsets_base DW_FORM_sec_offset
+COMMON: .debug_abbrev contents:
+COMMON-NOT: contents:
+COMMON: DW_TAG_compile_unit
+COMMON-NOT: DW_TAG
+COMMON: DW_AT_str_offsets_base DW_FORM_sec_offset
; Verify that strings are displayed correctly as indexed strings
-CHECK: .debug_info contents:
-CHECK-NOT: contents:
-CHECK: DW_TAG_compile_unit
-CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
-CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_1")
-CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
-CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU1")
-CHECK-NOT: NULL
-CHECK: DW_TAG_subprogram
-CHECK-NEXT: DW_AT_name [DW_FORM_strx1] ( indexed (00000003) string = "MyFunc")
-CHECK-NOT: NULL
-CHECK: DW_TAG_variable
-CHECK-NEXT: DW_AT_name [DW_FORM_strx2] ( indexed (00000004) string = "MyVar1")
-CHECK-NOT: NULL
-CHECK: DW_TAG_variable
-CHECK-NEXT: DW_AT_name [DW_FORM_strx3] ( indexed (00000005) string = "MyVar2")
-CHECK-NOT: NULL
-CHECK: DW_TAG_variable
-CHECK-NEXT: DW_AT_name [DW_FORM_strx4] ( indexed (00000006) string = "MyVar3")
+COMMON: .debug_info contents:
+COMMON-NOT: contents:
+COMMON: DW_TAG_compile_unit
+COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
+COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_1")
+COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
+COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU1")
+COMMON-NOT: NULL
+COMMON: DW_TAG_subprogram
+COMMON-NEXT: DW_AT_name [DW_FORM_strx1] ( indexed (00000003) string = "MyFunc")
+COMMON-NOT: NULL
+COMMON: DW_TAG_variable
+COMMON-NEXT: DW_AT_name [DW_FORM_strx2] ( indexed (00000004) string = "MyVar1")
+COMMON-NOT: NULL
+COMMON: DW_TAG_variable
+COMMON-NEXT: DW_AT_name [DW_FORM_strx3] ( indexed (00000005) string = "MyVar2")
+COMMON-NOT: NULL
+COMMON: DW_TAG_variable
+COMMON-NEXT: DW_AT_name [DW_FORM_strx4] ( indexed (00000006) string = "MyVar3")
; Second compile unit (b.cpp)
-CHECK: DW_TAG_compile_unit
-CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer")
-CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2")
-CHECK-NEXT: 
DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c) -CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2") +COMMON: DW_TAG_compile_unit +COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer") +COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2") +COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c) +COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2") ; The split CU -CHECK: .debug_info.dwo contents: -CHECK-NOT: contents: -CHECK: DW_TAG_compile_unit -CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade split DWARF producer") -CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_compile_unit") -CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008) -CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/splitCU") +SPLIT: .debug_info.dwo contents: +SPLIT-NOT: contents: +SPLIT: DW_TAG_compile_unit +SPLIT-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade split DWARF producer") +SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_compile_unit") +SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008) +SPLIT-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/splitCU") ; The type unit -CHECK: .debug_types contents: -CHECK: DW_TAG_type_unit -CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit") -CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040) -CHECK: DW_TAG_structure_type -CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct") +COMMON: .debug_types contents: +COMMON: DW_TAG_type_unit +COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit") +COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040) +COMMON: DW_TAG_structure_type +COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct") ; The split type unit -CHECK: .debug_types.dwo contents: -CHECK: DW_TAG_type_unit -CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "V5_split_type_unit") -CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c) -CHECK: DW_TAG_structure_type -CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_Mystruct") +SPLIT: .debug_types.dwo contents: +SPLIT: DW_TAG_type_unit +SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "V5_split_type_unit") +SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c) +SPLIT: DW_TAG_structure_type +SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_Mystruct") ; The .debug_str_offsets section -CHECK: .debug_str_offsets contents: -CHECK-NEXT: 0x00000000: Contribution size = 28, Version = 5 -CHECK-NEXT: 0x00000008: 00000000 "Handmade DWARF producer" -CHECK-NEXT: 0x0000000c: 00000018 "Compile_Unit_1" -CHECK-NEXT: 0x00000010: 00000027 "/home/test/CU1" -CHECK-NEXT: 0x00000014: 00000067 "MyFunc" -CHECK-NEXT: 0x00000018: 0000006e "MyVar1" -CHECK-NEXT: 0x0000001c: 00000075 "MyVar2" -CHECK-NEXT: 0x00000020: 0000007c "MyVar3" -CHECK-NEXT: 0x00000024: Contribution size = 12, Version = 5 -CHECK-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer" -CHECK-NEXT: 0x00000030: 00000036 "Compile_Unit_2" -CHECK-NEXT: 0x00000034: 00000045 "/home/test/CU2" -CHECK-NEXT: 0x00000038: Contribution size = 8, Version = 5 
-CHECK-NEXT: 0x00000040: 00000054 "Type_Unit" -CHECK-NEXT: 0x00000044: 0000005e "MyStruct" +COMMON: .debug_str_offsets contents: +COMMON-NEXT: 0x00000000: Contribution size = 28, Version = 5 +COMMON-NEXT: 0x00000008: 00000000 "Handmade DWARF producer" +COMMON-NEXT: 0x0000000c: 00000018 "Compile_Unit_1" +COMMON-NEXT: 0x00000010: 00000027 "/home/test/CU1" +COMMON-NEXT: 0x00000014: 00000067 "MyFunc" +COMMON-NEXT: 0x00000018: 0000006e "MyVar1" +COMMON-NEXT: 0x0000001c: 00000075 "MyVar2" +COMMON-NEXT: 0x00000020: 0000007c "MyVar3" +COMMON-NEXT: 0x00000024: Contribution size = 12, Version = 5 +COMMON-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer" +COMMON-NEXT: 0x00000030: 00000036 "Compile_Unit_2" +COMMON-NEXT: 0x00000034: 00000045 "/home/test/CU2" +COMMON-NEXT: 0x00000038: Contribution size = 8, Version = 5 +COMMON-NEXT: 0x00000040: 00000054 "Type_Unit" +COMMON-NEXT: 0x00000044: 0000005e "MyStruct" -CHECK: .debug_str_offsets.dwo contents: -CHECK-NEXT: 0x00000000: Contribution size = 12, Version = 5 -CHECK-NEXT: 0x00000008: 00000000 "Handmade split DWARF producer" -CHECK-NEXT: 0x0000000c: 0000001e "V5_split_compile_unit" -CHECK-NEXT: 0x00000010: 00000034 "/home/test/splitCU" -CHECK-NEXT: 0x00000014: Contribution size = 8, Version = 5 -CHECK-NEXT: 0x0000001c: 00000047 "V5_split_type_unit" -CHECK-NEXT: 0x00000020: 0000005a "V5_split_Mystruct" +SPLIT: .debug_str_offsets.dwo contents: +SPLIT-NEXT: 0x00000000: Contribution size = 12, Version = 5 +SPLIT-NEXT: 0x00000008: 00000000 "Handmade split DWARF producer" +SPLIT-NEXT: 0x0000000c: 0000001e "V5_split_compile_unit" +SPLIT-NEXT: 0x00000010: 00000034 "/home/test/splitCU" +SPLIT-NEXT: 0x00000014: Contribution size = 8, Version = 5 +SPLIT-NEXT: 0x0000001c: 00000047 "V5_split_type_unit" +SPLIT-NEXT: 0x00000020: 0000005a "V5_split_Mystruct" diff --git a/test/DebugInfo/invalid-relocations.test b/test/DebugInfo/invalid-relocations.test new file mode 100644 index 0000000000000..2252e1a205c3d --- /dev/null +++ b/test/DebugInfo/invalid-relocations.test @@ -0,0 +1,35 @@ +# RUN: yaml2obj %s > %t.o +# RUN: llvm-dwarfdump %t.o 2>&1 | FileCheck %s +# CHECK: failed to compute relocation: Unknown + +!ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_386 +Sections: + - Type: SHT_PROGBITS + Name: .text + Flags: [ ] + AddressAlign: 0x04 + Content: "0000" + - Type: SHT_PROGBITS + Name: .debug_info + Flags: [ ] + AddressAlign: 0x04 + Content: "0000" + - Type: SHT_REL + Name: .rel.debug_info + Link: .symtab + Info: .debug_info + Relocations: + - Offset: 0 + Symbol: _start + Type: 0xFF +Symbols: + Global: + - Name: _start + Type: STT_FUNC + Section: .text + Value: 0x0 diff --git a/test/DebugInfo/llvm-symbolizer.test b/test/DebugInfo/llvm-symbolizer.test index 2c64804659fce..bcad37cf9a489 100644 --- a/test/DebugInfo/llvm-symbolizer.test +++ b/test/DebugInfo/llvm-symbolizer.test @@ -10,9 +10,10 @@ RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x8dc" >> %t.input RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0xa05" >> %t.input RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x987" >> %t.input RUN: echo "%p/Inputs/dwarfdump-inl-test.high_pc.elf-x86-64 0x568" >> %t.input -RUN: echo "\"%p/Inputs/dwarfdump-test3.elf-x86-64 space\" 0x640" >> %t.input -RUN: echo "\"%p/Inputs/dwarfdump-test3.elf-x86-64 space\" 0x633" >> %t.input -RUN: echo "\"%p/Inputs/dwarfdump-test3.elf-x86-64 space\" 0x62d" >> %t.input +RUN: cp "%p/Inputs/dwarfdump-test3.elf-x86-64-space" "%T/dwarfdump-test3.elf-x86-64 space" +RUN: echo 
"\"%T/dwarfdump-test3.elf-x86-64 space\" 0x640" >> %t.input +RUN: echo "\"%T/dwarfdump-test3.elf-x86-64 space\" 0x633" >> %t.input +RUN: echo "\"%T/dwarfdump-test3.elf-x86-64 space\" 0x62d" >> %t.input RUN: echo "%p/Inputs/macho-universal 0x1f84" >> %t.input RUN: echo "%p/Inputs/macho-universal:i386 0x1f67" >> %t.input RUN: echo "%p/Inputs/macho-universal:x86_64 0x100000f05" >> %t.input diff --git a/test/Instrumentation/MemorySanitizer/unsized_type.ll b/test/Instrumentation/MemorySanitizer/unsized_type.ll new file mode 100644 index 0000000000000..94ae92d3354a4 --- /dev/null +++ b/test/Instrumentation/MemorySanitizer/unsized_type.ll @@ -0,0 +1,22 @@ +; Check that unsized token types used by coroutine intrinsics do not cause +; assertion failures. +; RUN: opt < %s -msan -S 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) +declare i1 @llvm.coro.alloc(token) + +define void @foo() sanitize_memory { +entry: + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) + %dyn.alloc.reqd = call i1 @llvm.coro.alloc(token %id) + ret void +} + +; CHECK: define void @foo +; CHECK-NEXT: entry: +; CHECK-NEXT: %id = call token @llvm.coro.id +; CHECK-NEXT: call i1 @llvm.coro.alloc(token %id) +; CHECK-NEXT: ret void diff --git a/test/Instrumentation/ThreadSanitizer/atomic.ll b/test/Instrumentation/ThreadSanitizer/atomic.ll index 30c58fea4cb7e..3d83d9299e664 100644 --- a/test/Instrumentation/ThreadSanitizer/atomic.ll +++ b/test/Instrumentation/ThreadSanitizer/atomic.ll @@ -1959,7 +1959,7 @@ entry: define void @atomic_signal_fence_acquire() nounwind uwtable { entry: - fence singlethread acquire, !dbg !7 + fence syncscope("singlethread") acquire, !dbg !7 ret void, !dbg !7 } ; CHECK-LABEL: atomic_signal_fence_acquire @@ -1975,7 +1975,7 @@ entry: define void @atomic_signal_fence_release() nounwind uwtable { entry: - fence singlethread release, !dbg !7 + fence syncscope("singlethread") release, !dbg !7 ret void, !dbg !7 } ; CHECK-LABEL: atomic_signal_fence_release @@ -1991,7 +1991,7 @@ entry: define void @atomic_signal_fence_acq_rel() nounwind uwtable { entry: - fence singlethread acq_rel, !dbg !7 + fence syncscope("singlethread") acq_rel, !dbg !7 ret void, !dbg !7 } ; CHECK-LABEL: atomic_signal_fence_acq_rel @@ -2007,7 +2007,7 @@ entry: define void @atomic_signal_fence_seq_cst() nounwind uwtable { entry: - fence singlethread seq_cst, !dbg !7 + fence syncscope("singlethread") seq_cst, !dbg !7 ret void, !dbg !7 } ; CHECK-LABEL: atomic_signal_fence_seq_cst diff --git a/test/LTO/Resolution/X86/linker-redef-thin.ll b/test/LTO/Resolution/X86/linker-redef-thin.ll new file mode 100644 index 0000000000000..ebaac8094e75a --- /dev/null +++ b/test/LTO/Resolution/X86/linker-redef-thin.ll @@ -0,0 +1,16 @@ +; RUN: opt -module-summary %s -o %t.o +; RUN: llvm-lto2 run -o %t1.o %t.o -r %t.o,patatino,pr +; RUN: llvm-readobj -t %t1.o.0 | FileCheck %s + +; CHECK: Name: patatino +; CHECK-NEXT: Value: +; CHECK-NEXT: Size: +; CHECK-NEXT: Binding: Weak +; CHECK-NEXT: Type: Function + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @patatino() { + ret void +} diff --git a/test/Linker/Inputs/syncscope-1.ll b/test/Linker/Inputs/syncscope-1.ll new file mode 100644 index 0000000000000..90578e931dd53 --- /dev/null +++ b/test/Linker/Inputs/syncscope-1.ll @@ -0,0 +1,6 @@ +define void @syncscope_1() { + fence 
syncscope("agent") seq_cst + fence syncscope("workgroup") seq_cst + fence syncscope("wavefront") seq_cst + ret void +} diff --git a/test/Linker/Inputs/syncscope-2.ll b/test/Linker/Inputs/syncscope-2.ll new file mode 100644 index 0000000000000..527c5bf93d005 --- /dev/null +++ b/test/Linker/Inputs/syncscope-2.ll @@ -0,0 +1,6 @@ +define void @syncscope_2() { + fence syncscope("image") seq_cst + fence syncscope("agent") seq_cst + fence syncscope("workgroup") seq_cst + ret void +} diff --git a/test/Linker/Inputs/thumb-module-inline-asm.ll b/test/Linker/Inputs/thumb-module-inline-asm.ll new file mode 100644 index 0000000000000..7792ff96d5b57 --- /dev/null +++ b/test/Linker/Inputs/thumb-module-inline-asm.ll @@ -0,0 +1,3 @@ +target triple = "thumbv7-linux-gnueabihf" + +module asm "orn r1, r2, r2" diff --git a/test/Linker/link-arm-and-thumb-module-inline-asm.ll b/test/Linker/link-arm-and-thumb-module-inline-asm.ll new file mode 100644 index 0000000000000..13779f37ffa0e --- /dev/null +++ b/test/Linker/link-arm-and-thumb-module-inline-asm.ll @@ -0,0 +1,20 @@ +; This test checks that proper directives to switch between ARM and Thumb mode +; are added when linking ARM and Thumb modules. + +; RUN: llvm-as %s -o %t1.bc +; RUN: llvm-as %p/Inputs/thumb-module-inline-asm.ll -o %t2.bc +; RUN: llvm-link %t1.bc %t2.bc -S 2> %t3.out | FileCheck %s + +target triple = "armv7-linux-gnueabihf" + +module asm "add r1, r2, r2" + +; CHECK: .text +; CHECK-NEXT: .balign 4 +; CHECK-NEXT: .arm +; CHECK-NEXT: add r1, r2, r2 +; CHECK-NEXT: module asm +; CHECK-NEXT: .text +; CHECK-NEXT: .balign 2 +; CHECK-NEXT: .thumb +; CHECK-NEXT: orn r1, r2, r2 diff --git a/test/Linker/syncscopes.ll b/test/Linker/syncscopes.ll new file mode 100644 index 0000000000000..a572c23cffbdb --- /dev/null +++ b/test/Linker/syncscopes.ll @@ -0,0 +1,11 @@ +; RUN: llvm-link %S/Inputs/syncscope-1.ll %S/Inputs/syncscope-2.ll -S | FileCheck %s + +; CHECK-LABEL: define void @syncscope_1 +; CHECK: fence syncscope("agent") seq_cst +; CHECK: fence syncscope("workgroup") seq_cst +; CHECK: fence syncscope("wavefront") seq_cst + +; CHECK-LABEL: define void @syncscope_2 +; CHECK: fence syncscope("image") seq_cst +; CHECK: fence syncscope("agent") seq_cst +; CHECK: fence syncscope("workgroup") seq_cst diff --git a/test/MC/AArch64/label-arithmetic-diags-elf.s b/test/MC/AArch64/label-arithmetic-diags-elf.s index dbfdd24f8dc91..2ef67fafb2ea5 100644 --- a/test/MC/AArch64/label-arithmetic-diags-elf.s +++ b/test/MC/AArch64/label-arithmetic-diags-elf.s @@ -5,7 +5,7 @@ b: .fill 300 e: .byte e - b - // CHECK: error: value evaluated as 300 is out of range. + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: value evaluated as 300 is out of range. 
// CHECK-NEXT: .byte e - b // CHECK-NEXT: ^ @@ -14,67 +14,74 @@ start: .space 5000 end: add w0, w1, #(end - start) - cmp w0, #(end - start) - // CHECK: error: fixup value out of range + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: fixup value out of range // CHECK-NEXT: add w0, w1, #(end - start) // CHECK-NEXT: ^ - // CHECK: error: fixup value out of range + + cmp w0, #(end - start) + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: fixup value out of range // CHECK-NEXT: cmp w0, #(end - start) // CHECK-NEXT: ^ negative: add w0, w1, #(end - negative) - cmp w0, #(end - negative) - // CHECK: error: fixup value out of range + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: fixup value out of range // CHECK-NEXT: add w0, w1, #(end - negative) // CHECK-NEXT: ^ - // CHECK: error: fixup value out of range + + cmp w0, #(end - negative) + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: fixup value out of range // CHECK-NEXT: cmp w0, #(end - negative) // CHECK-NEXT: ^ add w0, w1, #(end - external) - cmp w0, #(end - external) - // CHECK: error: symbol 'external' can not be undefined in a subtraction expression + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: symbol 'external' can not be undefined in a subtraction expression // CHECK-NEXT: add w0, w1, #(end - external) // CHECK-NEXT: ^ - // CHECK: error: symbol 'external' can not be undefined in a subtraction expression + + cmp w0, #(end - external) + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: symbol 'external' can not be undefined in a subtraction expression // CHECK-NEXT: cmp w0, #(end - external) // CHECK-NEXT: ^ add w0, w1, #:lo12:external - end - cmp w0, #:lo12:external - end - // CHECK: error: Unsupported pc-relative fixup kind + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Unsupported pc-relative fixup kind // CHECK-NEXT: add w0, w1, #:lo12:external - end // CHECK-NEXT: ^ - // CHECK: error: Unsupported pc-relative fixup kind + + cmp w0, #:lo12:external - end + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Unsupported pc-relative fixup kind // CHECK-NEXT: cmp w0, #:lo12:external - end // CHECK-NEXT: ^ add w0, w1, #:got_lo12:external - end - cmp w0, #:got_lo12:external - end - // CHECK: error: Unsupported pc-relative fixup kind + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Unsupported pc-relative fixup kind // CHECK-NEXT: add w0, w1, #:got_lo12:external - end // CHECK-NEXT: ^ - // CHECK: error: Unsupported pc-relative fixup kind + + cmp w0, #:got_lo12:external - end + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Unsupported pc-relative fixup kind // CHECK-NEXT: cmp w0, #:got_lo12:external - end // CHECK-NEXT: ^ .section sec_y end_across_sec: add w0, w1, #(end_across_sec - start) - cmp w0, #(end_across_sec - start) - // CHECK: error: Cannot represent a difference across sections + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Cannot represent a difference across sections // CHECK-NEXT: add w0, w1, #(end_across_sec - start) // CHECK-NEXT: ^ - // CHECK: error: Cannot represent a difference across sections + + cmp w0, #(end_across_sec - start) + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Cannot represent a difference across sections // CHECK-NEXT: cmp w0, #(end_across_sec - start) // CHECK-NEXT: ^ add w0, w1, #(sec_y - sec_x) - cmp w0, #(sec_y - sec_x) - // CHECK: error: Cannot represent a difference across sections + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: Cannot represent a difference across sections // CHECK-NEXT: add w0, w1, #(sec_y - sec_x) // CHECK-NEXT: ^ - // CHECK: error: Cannot represent a difference across sections + + cmp w0, #(sec_y - sec_x) + // CHECK: :[[@LINE-1]]:{{[0-9]+}}: 
error: Cannot represent a difference across sections // CHECK-NEXT: cmp w0, #(sec_y - sec_x) // CHECK-NEXT: ^ diff --git a/test/MC/AMDGPU/gfx9_asm_all.s b/test/MC/AMDGPU/gfx9_asm_all.s index 0c3dbd221a49e..56484a37bdcea 100644 --- a/test/MC/AMDGPU/gfx9_asm_all.s +++ b/test/MC/AMDGPU/gfx9_asm_all.s @@ -104933,3 +104933,462 @@ v_cmpx_t_u32_sdwa s[6:7], v1, v2 src0_sel:DWORD src1_sel:WORD_1 v_cmpx_t_u32_sdwa s[6:7], v1, sext(v2) src0_sel:DWORD src1_sel:DWORD // CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x86,0x06,0x0e] + +v_mad_mix_f32 v5, v1, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v255, v1, v2, v3 +// CHECK: [0xff,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v255, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0xff,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, s1, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x04,0x0e,0x1c] + +v_mad_mix_f32 v5, s101, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x65,0x04,0x0e,0x1c] + +v_mad_mix_f32 v5, flat_scratch_lo, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x66,0x04,0x0e,0x1c] + +v_mad_mix_f32 v5, flat_scratch_hi, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x67,0x04,0x0e,0x1c] + +v_mad_mix_f32 v5, vcc_lo, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x6a,0x04,0x0e,0x1c] + +v_mad_mix_f32 v5, vcc_hi, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x6b,0x04,0x0e,0x1c] + +v_mad_mix_f32 v5, m0, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x7c,0x04,0x0e,0x1c] + +v_mad_mix_f32 v5, exec_lo, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x7e,0x04,0x0e,0x1c] + +v_mad_mix_f32 v5, exec_hi, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x7f,0x04,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, v255, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xff,0x0f,0x1c] + +v_mad_mix_f32 v5, v1, s2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0c,0x1c] + +v_mad_mix_f32 v5, v1, s101, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xcb,0x0c,0x1c] + +v_mad_mix_f32 v5, v1, flat_scratch_lo, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xcd,0x0c,0x1c] + +v_mad_mix_f32 v5, v1, flat_scratch_hi, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xcf,0x0c,0x1c] + +v_mad_mix_f32 v5, v1, vcc_lo, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xd5,0x0c,0x1c] + +v_mad_mix_f32 v5, v1, vcc_hi, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xd7,0x0c,0x1c] + +v_mad_mix_f32 v5, v1, m0, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xf9,0x0c,0x1c] + +v_mad_mix_f32 v5, v1, exec_lo, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xfd,0x0c,0x1c] + +v_mad_mix_f32 v5, v1, exec_hi, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0xff,0x0c,0x1c] + +v_mad_mix_f32 v5, v1, v2, v255 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xfe,0x1f] + +v_mad_mix_f32 v5, v1, v2, s3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x18] + +v_mad_mix_f32 v5, v1, v2, s101 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x96,0x19] + +v_mad_mix_f32 v5, v1, v2, flat_scratch_lo +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x9a,0x19] + +v_mad_mix_f32 v5, v1, v2, flat_scratch_hi +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x9e,0x19] + +v_mad_mix_f32 v5, v1, v2, vcc_lo +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xaa,0x19] + +v_mad_mix_f32 v5, v1, v2, vcc_hi +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xae,0x19] + +v_mad_mix_f32 v5, v1, v2, m0 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xf2,0x19] + +v_mad_mix_f32 v5, v1, v2, exec_lo +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xfa,0x19] + +v_mad_mix_f32 v5, v1, v2, exec_hi +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0xfe,0x19] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel:[0,0,0] +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel:[1,0,0] +// CHECK: 
[0x05,0x48,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel:[0,1,0] +// CHECK: [0x05,0x50,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel:[0,0,1] +// CHECK: [0x05,0x60,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: [0x05,0x78,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[1,1,1] +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[0,0,0] +// CHECK: [0x05,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[1,0,0] +// CHECK: [0x05,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x0c] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[0,1,0] +// CHECK: [0x05,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x14] + +v_mad_mix_f32 v5, v1, v2, v3 op_sel_hi:[0,0,1] +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v5, -v1, v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x3c] + +v_mad_mix_f32 v5, v1, -v2, v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x5c] + +v_mad_mix_f32 v5, v1, v2, -v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x9c] + +v_mad_mix_f32 v5, -v1, -v2, -v3 +// CHECK: [0x05,0x40,0xa0,0xd3,0x01,0x05,0x0e,0xfc] + +v_mad_mix_f32 v5, |v1|, v2, v3 +// CHECK: [0x05,0x41,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, |v2|, v3 +// CHECK: [0x05,0x42,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, v2, |v3| +// CHECK: [0x05,0x44,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, |v1|, |v2|, |v3| +// CHECK: [0x05,0x47,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v5, v1, v2, v3 clamp +// CHECK: [0x05,0xc0,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v255, v1, v2, v3 +// CHECK: [0xff,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v255, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0xff,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, s1, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x04,0x0e,0x1c] + +v_mad_mixhi_f16 v5, s101, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x65,0x04,0x0e,0x1c] + +v_mad_mixhi_f16 v5, flat_scratch_lo, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x66,0x04,0x0e,0x1c] + +v_mad_mixhi_f16 v5, flat_scratch_hi, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x67,0x04,0x0e,0x1c] + +v_mad_mixhi_f16 v5, vcc_lo, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x6a,0x04,0x0e,0x1c] + +v_mad_mixhi_f16 v5, vcc_hi, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x6b,0x04,0x0e,0x1c] + +v_mad_mixhi_f16 v5, m0, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x7c,0x04,0x0e,0x1c] + +v_mad_mixhi_f16 v5, exec_lo, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x7e,0x04,0x0e,0x1c] + +v_mad_mixhi_f16 v5, exec_hi, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x7f,0x04,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v255, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xff,0x0f,0x1c] + +v_mad_mixhi_f16 v5, v1, s2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0c,0x1c] + +v_mad_mixhi_f16 v5, v1, s101, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xcb,0x0c,0x1c] + +v_mad_mixhi_f16 v5, v1, flat_scratch_lo, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xcd,0x0c,0x1c] + +v_mad_mixhi_f16 v5, v1, flat_scratch_hi, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xcf,0x0c,0x1c] + +v_mad_mixhi_f16 v5, v1, vcc_lo, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xd5,0x0c,0x1c] + +v_mad_mixhi_f16 v5, v1, vcc_hi, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xd7,0x0c,0x1c] + +v_mad_mixhi_f16 v5, v1, m0, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xf9,0x0c,0x1c] + +v_mad_mixhi_f16 v5, v1, exec_lo, v3 +// CHECK: 
[0x05,0x40,0xa2,0xd3,0x01,0xfd,0x0c,0x1c] + +v_mad_mixhi_f16 v5, v1, exec_hi, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0xff,0x0c,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, v255 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xfe,0x1f] + +v_mad_mixhi_f16 v5, v1, v2, s3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x18] + +v_mad_mixhi_f16 v5, v1, v2, s101 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x96,0x19] + +v_mad_mixhi_f16 v5, v1, v2, flat_scratch_lo +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x9a,0x19] + +v_mad_mixhi_f16 v5, v1, v2, flat_scratch_hi +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x9e,0x19] + +v_mad_mixhi_f16 v5, v1, v2, vcc_lo +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xaa,0x19] + +v_mad_mixhi_f16 v5, v1, v2, vcc_hi +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xae,0x19] + +v_mad_mixhi_f16 v5, v1, v2, m0 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xf2,0x19] + +v_mad_mixhi_f16 v5, v1, v2, exec_lo +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xfa,0x19] + +v_mad_mixhi_f16 v5, v1, v2, exec_hi +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0xfe,0x19] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[0,0,0] +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[1,0,0] +// CHECK: [0x05,0x48,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[0,1,0] +// CHECK: [0x05,0x50,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[0,0,1] +// CHECK: [0x05,0x60,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: [0x05,0x78,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[1,1,1] +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[0,0,0] +// CHECK: [0x05,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[1,0,0] +// CHECK: [0x05,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x0c] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[0,1,0] +// CHECK: [0x05,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x14] + +v_mad_mixhi_f16 v5, v1, v2, v3 op_sel_hi:[0,0,1] +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mixhi_f16 v5, -v1, v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x3c] + +v_mad_mixhi_f16 v5, v1, -v2, v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x5c] + +v_mad_mixhi_f16 v5, v1, v2, -v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x9c] + +v_mad_mixhi_f16 v5, -v1, -v2, -v3 +// CHECK: [0x05,0x40,0xa2,0xd3,0x01,0x05,0x0e,0xfc] + +v_mad_mixhi_f16 v5, |v1|, v2, v3 +// CHECK: [0x05,0x41,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, |v2|, v3 +// CHECK: [0x05,0x42,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, |v3| +// CHECK: [0x05,0x44,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, |v1|, |v2|, |v3| +// CHECK: [0x05,0x47,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixhi_f16 v5, v1, v2, v3 clamp +// CHECK: [0x05,0xc0,0xa2,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v255, v1, v2, v3 +// CHECK: [0xff,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v255, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0xff,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, s1, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x04,0x0e,0x1c] + +v_mad_mixlo_f16 v5, s101, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x65,0x04,0x0e,0x1c] + +v_mad_mixlo_f16 v5, flat_scratch_lo, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x66,0x04,0x0e,0x1c] + +v_mad_mixlo_f16 v5, flat_scratch_hi, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x67,0x04,0x0e,0x1c] + +v_mad_mixlo_f16 
v5, vcc_lo, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x6a,0x04,0x0e,0x1c] + +v_mad_mixlo_f16 v5, vcc_hi, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x6b,0x04,0x0e,0x1c] + +v_mad_mixlo_f16 v5, m0, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x7c,0x04,0x0e,0x1c] + +v_mad_mixlo_f16 v5, exec_lo, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x7e,0x04,0x0e,0x1c] + +v_mad_mixlo_f16 v5, exec_hi, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x7f,0x04,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v255, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xff,0x0f,0x1c] + +v_mad_mixlo_f16 v5, v1, s2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0c,0x1c] + +v_mad_mixlo_f16 v5, v1, s101, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xcb,0x0c,0x1c] + +v_mad_mixlo_f16 v5, v1, flat_scratch_lo, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xcd,0x0c,0x1c] + +v_mad_mixlo_f16 v5, v1, flat_scratch_hi, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xcf,0x0c,0x1c] + +v_mad_mixlo_f16 v5, v1, vcc_lo, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xd5,0x0c,0x1c] + +v_mad_mixlo_f16 v5, v1, vcc_hi, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xd7,0x0c,0x1c] + +v_mad_mixlo_f16 v5, v1, m0, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xf9,0x0c,0x1c] + +v_mad_mixlo_f16 v5, v1, exec_lo, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xfd,0x0c,0x1c] + +v_mad_mixlo_f16 v5, v1, exec_hi, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0xff,0x0c,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, v255 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xfe,0x1f] + +v_mad_mixlo_f16 v5, v1, v2, s3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x18] + +v_mad_mixlo_f16 v5, v1, v2, s101 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x96,0x19] + +v_mad_mixlo_f16 v5, v1, v2, flat_scratch_lo +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x9a,0x19] + +v_mad_mixlo_f16 v5, v1, v2, flat_scratch_hi +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x9e,0x19] + +v_mad_mixlo_f16 v5, v1, v2, vcc_lo +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xaa,0x19] + +v_mad_mixlo_f16 v5, v1, v2, vcc_hi +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xae,0x19] + +v_mad_mixlo_f16 v5, v1, v2, m0 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xf2,0x19] + +v_mad_mixlo_f16 v5, v1, v2, exec_lo +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xfa,0x19] + +v_mad_mixlo_f16 v5, v1, v2, exec_hi +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0xfe,0x19] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[0,0,0] +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[1,0,0] +// CHECK: [0x05,0x48,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[0,1,0] +// CHECK: [0x05,0x50,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[0,0,1] +// CHECK: [0x05,0x60,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel:[1,1,1] +// CHECK: [0x05,0x78,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[1,1,1] +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[0,0,0] +// CHECK: [0x05,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[1,0,0] +// CHECK: [0x05,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x0c] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[0,1,0] +// CHECK: [0x05,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x14] + +v_mad_mixlo_f16 v5, v1, v2, v3 op_sel_hi:[0,0,1] +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mixlo_f16 v5, -v1, v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x3c] + +v_mad_mixlo_f16 v5, v1, -v2, v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x5c] + +v_mad_mixlo_f16 v5, v1, v2, -v3 +// CHECK: 
[0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x9c] + +v_mad_mixlo_f16 v5, -v1, -v2, -v3 +// CHECK: [0x05,0x40,0xa1,0xd3,0x01,0x05,0x0e,0xfc] + +v_mad_mixlo_f16 v5, |v1|, v2, v3 +// CHECK: [0x05,0x41,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, |v2|, v3 +// CHECK: [0x05,0x42,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, |v3| +// CHECK: [0x05,0x44,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, |v1|, |v2|, |v3| +// CHECK: [0x05,0x47,0xa1,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v5, v1, v2, v3 clamp +// CHECK: [0x05,0xc0,0xa1,0xd3,0x01,0x05,0x0e,0x1c] diff --git a/test/MC/AMDGPU/vop3p-err.s b/test/MC/AMDGPU/vop3p-err.s index f4b1a3da714f8..bc6f6100f327d 100644 --- a/test/MC/AMDGPU/vop3p-err.s +++ b/test/MC/AMDGPU/vop3p-err.s @@ -71,47 +71,6 @@ v_pk_add_u16 v1, abs(v2), v3 // GFX9: :19: error: invalid operand for instruction v_pk_add_u16 v1, -v2, v3 - -// -// Packed operands on the non-packed VOP3P instructions -// - -// GFX9: invalid operand for instruction -v_mad_mix_f32 v1, v2, v3, v4 op_sel:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mix_f32 v1, v2, v3, v4 op_sel_hi:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mix_f32 v1, v2, v3, v4 neg_lo:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mix_f32 v1, v2, v3, v4 neg_hi:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mixlo_f16 v1, v2, v3, v4 op_sel:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mixlo_f16 v1, v2, v3, v4 op_sel_hi:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mixlo_f16 v1, v2, v3, v4 neg_lo:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mixlo_f16 v1, v2, v3, v4 neg_hi:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mixhi_f16 v1, v2, v3, v4 op_sel:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mixhi_f16 v1, v2, v3, v4 op_sel_hi:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mixhi_f16 v1, v2, v3, v4 neg_lo:[0,0,0] - -// GFX9: invalid operand for instruction -v_mad_mixhi_f16 v1, v2, v3, v4 neg_hi:[0,0,0] - // // Constant bus restrictions // diff --git a/test/MC/AMDGPU/vop3p.s b/test/MC/AMDGPU/vop3p.s index c9eda69e13d2a..97c3650cdf54f 100644 --- a/test/MC/AMDGPU/vop3p.s +++ b/test/MC/AMDGPU/vop3p.s @@ -169,48 +169,81 @@ v_pk_max_f16 v0, v1, v2 // GFX9: v_pk_max_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x92,0xd3,0x01,0x05,0x02,0x18] v_mad_mix_f32 v0, v1, v2, v3 -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c] v_mad_mixlo_f16 v0, v1, v2, v3 -// GFX9: v_mad_mixlo_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x04] +// GFX9: v_mad_mixlo_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa1,0xd3,0x01,0x05,0x0e,0x1c] v_mad_mixhi_f16 v0, v1, v2, v3 -// GFX9: v_mad_mixhi_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x04] - +// GFX9: v_mad_mixhi_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa2,0xd3,0x01,0x05,0x0e,0x1c] // // Regular source modifiers on non-packed instructions // v_mad_mix_f32 v0, abs(v1), v2, v3 -// GFX9: v_mad_mix_f32 v0, |v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x04] +// GFX9: v_mad_mix_f32 v0, |v1|, v2, v3 ; encoding: [0x00,0x41,0xa0,0xd3,0x01,0x05,0x0e,0x1c] v_mad_mix_f32 v0, v1, abs(v2), v3 -// GFX9: v_mad_mix_f32 v0, v1, |v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x04] +// GFX9: v_mad_mix_f32 v0, v1, |v2|, v3 ; encoding: [0x00,0x42,0xa0,0xd3,0x01,0x05,0x0e,0x1c] 
v_mad_mix_f32 v0, v1, v2, abs(v3) -// GFX9: v_mad_mix_f32 v0, v1, v2, |v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x04] +// GFX9: v_mad_mix_f32 v0, v1, v2, |v3| ; encoding: [0x00,0x44,0xa0,0xd3,0x01,0x05,0x0e,0x1c] v_mad_mix_f32 v0, -v1, v2, v3 -// GFX9: v_mad_mix_f32 v0, -v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x24] +// GFX9: v_mad_mix_f32 v0, -v1, v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x3c] v_mad_mix_f32 v0, v1, -v2, v3 -// GFX9: v_mad_mix_f32 v0, v1, -v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x44] +// GFX9: v_mad_mix_f32 v0, v1, -v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x5c] v_mad_mix_f32 v0, v1, v2, -v3 -// GFX9: v_mad_mix_f32 v0, v1, v2, -v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x84] +// GFX9: v_mad_mix_f32 v0, v1, v2, -v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x9c] v_mad_mix_f32 v0, -abs(v1), v2, v3 -// GFX9: v_mad_mix_f32 v0, -|v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x24] +// GFX9: v_mad_mix_f32 v0, -|v1|, v2, v3 ; encoding: [0x00,0x41,0xa0,0xd3,0x01,0x05,0x0e,0x3c] v_mad_mix_f32 v0, v1, -abs(v2), v3 -// GFX9: v_mad_mix_f32 v0, v1, -|v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x44] +// GFX9: v_mad_mix_f32 v0, v1, -|v2|, v3 ; encoding: [0x00,0x42,0xa0,0xd3,0x01,0x05,0x0e,0x5c] v_mad_mix_f32 v0, v1, v2, -abs(v3) -// GFX9: v_mad_mix_f32 v0, v1, v2, -|v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x84] +// GFX9: v_mad_mix_f32 v0, v1, v2, -|v3| ; encoding: [0x00,0x44,0xa0,0xd3,0x01,0x05,0x0e,0x9c] v_mad_mixlo_f16 v0, abs(v1), -v2, abs(v3) -// GFX9: v_mad_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0xa1,0xd3,0x01,0x05,0x0e,0x44] +// GFX9: v_mad_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x45,0xa1,0xd3,0x01,0x05,0x0e,0x5c] v_mad_mixhi_f16 v0, -v1, abs(v2), -abs(v3) -// GFX9: v_mad_mixhi_f16 v0, -v1, |v2|, -|v3| ; encoding: [0x00,0x06,0xa2,0xd3,0x01,0x05,0x0e,0xa4] +// GFX9: v_mad_mixhi_f16 v0, -v1, |v2|, -|v3| ; encoding: [0x00,0x46,0xa2,0xd3,0x01,0x05,0x0e,0xbc] + +// +// op_sel with non-packed instructions +// + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x00,0x48,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x00,0x50,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x60,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x00,0x78,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,0] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x0c] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x14] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1] +// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,1,1] +// 
GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c]
diff --git a/test/MC/ARM/elf-movt.s b/test/MC/ARM/elf-movt.s
index 9df7a603b71a7..858e4aa41b29c 100644
--- a/test/MC/ARM/elf-movt.s
+++ b/test/MC/ARM/elf-movt.s
@@ -14,8 +14,20 @@ barf: @ @barf
movw r0, :lower16:GOT-(.LPC0_2+8)
movt r0, :upper16:GOT-(.LPC0_2+8)
.LPC0_2:
+ movw r0, :lower16:extern_symbol+1234
+ movt r0, :upper16:extern_symbol+1234
+
+ movw r0, :lower16:(foo - bar + 1234)
+ movt r0, :upper16:(foo - bar + 1234)
+foo:
+bar:
+
@ ASM: movw r0, :lower16:(GOT-(.LPC0_2+8))
@ ASM-NEXT: movt r0, :upper16:(GOT-(.LPC0_2+8))
+@ ASM: movw r0, :lower16:(extern_symbol+1234)
+@ ASM-NEXT: movt r0, :upper16:(extern_symbol+1234)
+@ ASM: movw r0, :lower16:((foo-bar)+1234)
+@ ASM-NEXT: movt r0, :upper16:((foo-bar)+1234)
@OBJ: Disassembly of section .text:
@OBJ-NEXT: barf:
@OBJ-NEXT: 0: f0 0f 0f e3 movw r0, #65520
@OBJ-NEXT: 00000000: R_ARM_MOVW_PREL_NC GOT
@OBJ-NEXT: 4: f4 0f 4f e3 movt r0, #65524
@OBJ-NEXT: 00000004: R_ARM_MOVT_PREL GOT
+@OBJ-NEXT: 8: d2 04 00 e3 movw r0, #1234
+@OBJ-NEXT: 00000008: R_ARM_MOVW_ABS_NC extern_symbol
+@OBJ-NEXT: c: d2 04 40 e3 movt r0, #1234
+@OBJ-NEXT: 0000000c: R_ARM_MOVT_ABS extern_symbol
+@OBJ-NEXT: 10: d2 04 00 e3 movw r0, #1234
+@OBJ-NEXT: 14: 00 00 40 e3 movt r0, #0
@THUMB: Disassembly of section .text:
@THUMB-NEXT: barf:
@THUMB-NEXT: 0: 40 f2 f0 70 movw r0, #65520
@THUMB-NEXT: 00000000: R_ARM_THM_MOVW_PREL_NC GOT
@THUMB-NEXT: 4: cf f6 f4 70 movt r0, #65524
@THUMB-NEXT: 00000004: R_ARM_THM_MOVT_PREL GOT
+@THUMB-NEXT: 8: 40 f2 d2 40 movw r0, #1234
+@THUMB-NEXT: 00000008: R_ARM_THM_MOVW_ABS_NC extern_symbol
+@THUMB-NEXT: c: c0 f2 d2 40 movt r0, #1234
+@THUMB-NEXT: 0000000c: R_ARM_THM_MOVT_ABS extern_symbol
+@THUMB-NEXT: 10: 40 f2 d2 40 movw r0, #1234
+@THUMB-NEXT: 14: c0 f2 00 00 movt r0, #0
diff --git a/test/MC/ARM/invalid-instructions-spellcheck.s b/test/MC/ARM/invalid-instructions-spellcheck.s
new file mode 100644
index 0000000000000..ca118cff6ddf2
--- /dev/null
+++ b/test/MC/ARM/invalid-instructions-spellcheck.s
@@ -0,0 +1,68 @@
+@ RUN: not llvm-mc -triple=arm -show-encoding < %s 2>&1 | FileCheck %s
+@ RUN: not llvm-mc -triple=thumb -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-THUMB
+
+@ This tests the mnemonic spell checker.
+
+@ First check what happens when an instruction is omitted:
+
+ r1, r2, r3
+
+@ CHECK: error: unexpected token in operand
+@ CHECK-NEXT: r1, r2, r3
+@ CHECK-NEXT: ^
+
+@ We don't want to see a suggestion here; the edit distance is too large to
+@ give sensible suggestions:
+
+ aaaaaaaaaaaaaaa r1, r2, r3
+
+@ CHECK: error: invalid instruction
+@ CHECK-NEXT: aaaaaaaaaaaaaaa r1, r2, r3
+@ CHECK-NEXT: ^
+
+@ Check that we get one suggestion: 'pushh' is 1 edit away, i.e. a deletion.
+
+ pushh r1, r2, r3
+
+@CHECK: error: invalid instruction, did you mean: push?
+@CHECK-NEXT: pushh r1, r2, r3
+@CHECK-NEXT: ^
+
+ adXd r1, r2, r3
+
+@ Check edit distance 1 and 2: 'add' has edit distance of 1 (a deletion),
+@ and 'qadd' a distance of 2 (a deletion and an insertion)
+
+@ CHECK: error: invalid instruction, did you mean: add, qadd?
+@ CHECK-NEXT: adXd r1, r2, r3
+@ CHECK-NEXT: ^
+
+@ Check edit distance 1 and 2, just insertions:
+
+ ad r1, r2, r3
+
+@ CHECK: error: invalid instruction, did you mean: adc, add, adr, and, qadd?
+@ CHECK-NEXT: ad r1, r2, r3
+@ CHECK-NEXT: ^
+
+@ Check an instruction that is 2 edits away, and also has a lot of candidates:
+
+ ldre r1, r2, r3
+
+@ CHECK: error: invalid instruction, did you mean: ldr, ldrb, ldrd, ldrex, ldrexb, ldrexd, ldrexh, ldrh, ldrt?
+@ CHECK-NEXT: ldre r1, r2, r3
+@ CHECK-NEXT: ^
+
+@ Here it is checked that we don't suggest instructions that are not supported.
+@ For example, in Thumb mode we don't want to see suggestions 'faddd' or 'qadd'
+@ because they are not supported.
+
+ fadd r1, r2, r3
+
+@ CHECK-THUMB: error: invalid instruction, did you mean: add?
+@ CHECK-THUMB: fadd r1, r2, r3
+@ CHECK-THUMB: ^
+
+@ CHECK: error: invalid instruction, did you mean: add, qadd?
+@ CHECK-NEXT: fadd r1, r2, r3
+@ CHECK-NEXT: ^
diff --git a/test/MC/ARM/ldr-pseudo-unpredictable.s b/test/MC/ARM/ldr-pseudo-unpredictable.s
index b275dc71ab4b6..ad5a176e0433d 100644
--- a/test/MC/ARM/ldr-pseudo-unpredictable.s
+++ b/test/MC/ARM/ldr-pseudo-unpredictable.s
@@ -1,8 +1,8 @@
@RUN: llvm-mc -triple armv5-unknown-linux-gnueabi %s | FileCheck --check-prefix=CHECK-ARM %s
-@RUN: not llvm-mc -triple thumbv7-unknown-linux-gnueabi %s 2>&1 | FileCheck --check-prefix=CHECK-SP %s
+@RUN: llvm-mc -triple thumbv7-unknown-linux-gnueabi %s 2>&1 | FileCheck --check-prefix=CHECK-T2 %s
@RUN: not llvm-mc -triple thumbv5-unknown-linux-gnueabi %s 2>&1 | FileCheck --check-prefix=CHECK-NONE %s
@RUN: llvm-mc -triple armv5-base-apple-darwin %s | FileCheck --check-prefix=CHECK-DARWIN-ARM %s
-@RUN: not llvm-mc -triple thumbv7-base-apple-darwin %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN-SP %s
+@RUN: llvm-mc -triple thumbv7-base-apple-darwin %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN-T2 %s
@RUN: not llvm-mc -triple thumbv5-base.apple.darwin %s 2>&1 | FileCheck --check-prefix=CHECK-NONE %s

@ We don't do the transformation for rt = sp or pc
@@ -10,12 +10,12 @@
ldr pc, = 0x4
@ CHECK-ARM: ldr pc, .Ltmp[[TMP0:[0-9]+]]
@ CHECK-DARWIN-ARM: ldr pc, Ltmp0
-@ CHECK-SP: error: instruction requires: arm-mode
-@ CHECK-DARWIN-SP: error: instruction requires: arm-mode
-@ CHECK-NONE: error: instruction requires: arm-mode
+@ CHECK-T2: ldr.w pc, .Ltmp[[TMP0:[0-9]+]]
+@ CHECK-DARWIN-T2: ldr.w pc, Ltmp0
+@ CHECK-NONE: error: instruction requires: thumb2
ldr sp, = 0x8
@ CHECK-ARM: ldr sp, .Ltmp[[TMP1:[0-9]+]]
@ CHECK-DARWIN-ARM: ldr sp, Ltmp1
-@ CHECK-SP: ldr.w sp, .Ltmp[[TMP0:[0-9]+]]
-@ CHECK-DARWIN-SP: ldr.w sp, Ltmp0
-@ CHECK-NONE: error: instruction requires: arm-mode
+@ CHECK-T2: ldr.w sp, .Ltmp[[TMP1:[0-9]+]]
+@ CHECK-DARWIN-T2: ldr.w sp, Ltmp1
+@ CHECK-NONE: error: instruction requires: thumb2
diff --git a/test/MC/COFF/bad-expr.s b/test/MC/COFF/bad-expr.s
index ecbdd415c3a61..cbbd5d0c946f7 100644
--- a/test/MC/COFF/bad-expr.s
+++ b/test/MC/COFF/bad-expr.s
@@ -1,7 +1,6 @@
// RUN: not llvm-mc -filetype=obj -triple i386-pc-win32 %s 2>&1 | FileCheck %s
-// CHECK: symbol '__ImageBase' can not be undefined in a subtraction expression
-
.data
_x:
+// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: symbol '__ImageBase' can not be undefined in a subtraction expression
.long _x-__ImageBase
diff --git a/test/MC/COFF/cv-def-range-gap.s b/test/MC/COFF/cv-def-range-gap.s
index 9c1531819963f..29f2def8e1bfc 100644
--- a/test/MC/COFF/cv-def-range-gap.s
+++ b/test/MC/COFF/cv-def-range-gap.s
@@ -2,12 +2,13 @@

# This tries to test defrange gap edge cases.
-# CHECK: Local { +# CHECK: LocalSym { # CHECK: Type: int (0x74) # CHECK: VarName: p # CHECK: } -# CHECK-NOT: Local { -# CHECK: DefRangeRegister { +# CHECK-NOT: LocalSym { +# CHECK: DefRangeRegisterSym { +# CHECK-NEXT: Kind: S_DEFRANGE_REGISTER (0x1141) # CHECK-NEXT: Register: 23 # CHECK-NEXT: MayHaveNoName: 0 # CHECK-NEXT: LocalVariableAddrRange { @@ -20,7 +21,8 @@ # CHECK-NEXT: Range: 0x1 # CHECK-NEXT: ] # CHECK-NEXT: } -# CHECK-NEXT: DefRangeRegister { +# CHECK-NEXT: DefRangeRegisterSym { +# CHECK-NEXT: Kind: S_DEFRANGE_REGISTER (0x1141) # CHECK-NEXT: Register: 23 # CHECK-NEXT: MayHaveNoName: 0 # CHECK-NEXT: LocalVariableAddrRange { @@ -29,7 +31,8 @@ # CHECK-NEXT: Range: 0x6 # CHECK-NEXT: } # CHECK-NEXT: } -# CHECK-NEXT: DefRangeRegister { +# CHECK-NEXT: DefRangeRegisterSym { +# CHECK-NEXT: Kind: S_DEFRANGE_REGISTER (0x1141) # CHECK-NEXT: Register: 23 # CHECK-NEXT: MayHaveNoName: 0 # CHECK-NEXT: LocalVariableAddrRange { @@ -38,7 +41,8 @@ # CHECK-NEXT: Range: 0x1 # CHECK-NEXT: } # CHECK-NEXT: } -# CHECK-NEXT: DefRangeRegister { +# CHECK-NEXT: DefRangeRegisterSym { +# CHECK-NEXT: Kind: S_DEFRANGE_REGISTER (0x1141) # CHECK-NEXT: Register: 23 # CHECK-NEXT: MayHaveNoName: 0 # CHECK-NEXT: LocalVariableAddrRange { diff --git a/test/MC/COFF/cv-def-range.s b/test/MC/COFF/cv-def-range.s index 5ac0df7f7d964..7a90ec263683c 100644 --- a/test/MC/COFF/cv-def-range.s +++ b/test/MC/COFF/cv-def-range.s @@ -77,18 +77,18 @@ Ltmp3: .short 4431 # Record kind: S_PROC_ID_END .cv_def_range Lvar_begin0 Lvar_end0, "\102\021\374\377\377\377" -# CHECK: DefRangeFramePointerRel { +# CHECK: DefRangeFramePointerRelSym { # CHECK: Offset: -4 # CHECK: LocalVariableAddrRange { # CHECK: OffsetStart: .text+0x9 # CHECK: ISectStart: 0x0 # CHECK: Range: 0xF # CHECK: } +# CHECK: BlockRelocations [ +# CHECK: 0x4 IMAGE_REL_I386_SECREL .text +# CHECK: 0x8 IMAGE_REL_I386_SECTION .text +# CHECK: ] # CHECK: } -# CHECK: BlockRelocations [ -# CHECK: 0x4 IMAGE_REL_I386_SECREL .text -# CHECK: 0x8 IMAGE_REL_I386_SECTION .text -# CHECK: ] Ltmp1: .p2align 2 diff --git a/test/MC/COFF/cv-inline-linetable-infloop.s b/test/MC/COFF/cv-inline-linetable-infloop.s index 804ed6f404d99..6b8e708befc4e 100644 --- a/test/MC/COFF/cv-inline-linetable-infloop.s +++ b/test/MC/COFF/cv-inline-linetable-infloop.s @@ -1,6 +1,6 @@ # RUN: llvm-mc -triple=x86_64-pc-win32 -filetype=obj < %s | llvm-readobj -codeview | FileCheck %s -# CHECK: InlineSite { +# CHECK: InlineSiteSym { # CHECK: BinaryAnnotations [ # CHECK: ChangeLineOffset: 1 # CHECK: ChangeCodeLength: 0x2 diff --git a/test/MC/COFF/cv-inline-linetable-unlikely.s b/test/MC/COFF/cv-inline-linetable-unlikely.s index dd3a66f419cc4..bfb745bd9bb1e 100644 --- a/test/MC/COFF/cv-inline-linetable-unlikely.s +++ b/test/MC/COFF/cv-inline-linetable-unlikely.s @@ -19,13 +19,13 @@ # calls to __asan_report*, for which it is very important to have an accurate # stack trace. 
-# CHECK: ProcStart { +# CHECK: GlobalProcIdSym { # CHECK: FunctionType: g (0x1003) # CHECK: CodeOffset: g+0x0 # CHECK: DisplayName: g # CHECK: LinkageName: g # CHECK: } -# CHECK: InlineSite { +# CHECK: InlineSiteSym { # CHECK: Inlinee: f (0x1002) # CHECK: BinaryAnnotations [ # CHECK-NEXT: ChangeCodeOffsetAndLineOffset: {CodeOffset: 0xE, LineOffset: 1} diff --git a/test/MC/COFF/cv-inline-linetable-unreachable.s b/test/MC/COFF/cv-inline-linetable-unreachable.s index 0f29d1667c359..d894fc758fb15 100644 --- a/test/MC/COFF/cv-inline-linetable-unreachable.s +++ b/test/MC/COFF/cv-inline-linetable-unreachable.s @@ -76,7 +76,7 @@ Ltmp6: .short 4429 .asciz "\000\000\000\000\000\000\000\000\003\020\000" .cv_inline_linetable 1 1 3 Lfunc_begin0 Lfunc_end0 -# CHECK: InlineSite { +# CHECK: InlineSiteSym { # CHECK: PtrParent: 0x0 # CHECK: PtrEnd: 0x0 # CHECK: Inlinee: f (0x1003) diff --git a/test/MC/COFF/cv-inline-linetable.s b/test/MC/COFF/cv-inline-linetable.s index bb68fcde21be2..2c89f9836c423 100644 --- a/test/MC/COFF/cv-inline-linetable.s +++ b/test/MC/COFF/cv-inline-linetable.s @@ -88,7 +88,7 @@ Ltmp4: .short 4429 .asciz "\000\000\000\000\000\000\000\000\003\020\000" .cv_inline_linetable 1 1 9 Lfunc_begin0 Lfunc_end0 -# CHECK: InlineSite { +# CHECK: InlineSiteSym { # CHECK: PtrParent: 0x0 # CHECK: PtrEnd: 0x0 # CHECK: Inlinee: bar (0x1003) @@ -106,7 +106,7 @@ Ltmp6: .short 4429 .asciz "\000\000\000\000\000\000\000\000\004\020\000" .cv_inline_linetable 2 1 3 Lfunc_begin0 Lfunc_end0 -# CHECK: InlineSite { +# CHECK: InlineSiteSym { # CHECK: PtrParent: 0x0 # CHECK: PtrEnd: 0x0 # CHECK: Inlinee: foo (0x1004) diff --git a/test/MC/Disassembler/Mips/mt/valid-r2-el.txt b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt new file mode 100644 index 0000000000000..62e7092086aa0 --- /dev/null +++ b/test/MC/Disassembler/Mips/mt/valid-r2-el.txt @@ -0,0 +1,32 @@ +# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+mt | FileCheck %s +0xc1 0x0b 0x60 0x41 # CHECK: dmt +0xc1 0x0b 0x65 0x41 # CHECK: dmt $5 +0xe1 0x0b 0x60 0x41 # CHECK: emt +0xe1 0x0b 0x64 0x41 # CHECK: emt $4 +0x01 0x00 0x60 0x41 # CHECK: dvpe +0x01 0x00 0x66 0x41 # CHECK: dvpe $6 +0x21 0x00 0x60 0x41 # CHECK: evpe +0x21 0x00 0x64 0x41 # CHECK: evpe $4 +0x08 0x10 0x65 0x7c # CHECK: fork $2, $3, $5 +0x09 0x00 0x80 0x7c # CHECK: yield $4 +0x09 0x20 0xa0 0x7c # CHECK: yield $4, $5 +0x02 0x20 0x05 0x41 # CHECK: mftr $4, $5, 0, 2, 0 +0x20 0x20 0x05 0x41 # CHECK: mftr $4, $5, 1, 0, 0 +0x21 0x20 0x00 0x41 # CHECK: mftr $4, $zero, 1, 1, 0 +0x21 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 1, 0 +0x22 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 0 +0x32 0x20 0x0a 0x41 # CHECK: mftr $4, $10, 1, 2, 1 +0x23 0x20 0x1a 0x41 # CHECK: mftr $4, $26, 1, 3, 0 +0x23 0x20 0x1f 0x41 # CHECK: mftr $4, $ra, 1, 3, 0 +0x24 0x20 0x0e 0x41 # CHECK: mftr $4, $14, 1, 4, 0 +0x25 0x20 0x0f 0x41 # CHECK: mftr $4, $15, 1, 5, 0 +0x02 0x28 0x84 0x41 # CHECK: mttr $4, $5, 0, 2, 0 +0x20 0x28 0x84 0x41 # CHECK: mttr $4, $5, 1, 0, 0 +0x21 0x00 0x84 0x41 # CHECK: mttr $4, $zero, 1, 1, 0 +0x21 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 1, 0 +0x22 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 0 +0x32 0x50 0x84 0x41 # CHECK: mttr $4, $10, 1, 2, 1 +0x23 0xd0 0x84 0x41 # CHECK: mttr $4, $26, 1, 3, 0 +0x23 0xf8 0x84 0x41 # CHECK: mttr $4, $ra, 1, 3, 0 +0x24 0x70 0x84 0x41 # CHECK: mttr $4, $14, 1, 4, 0 +0x25 0x78 0x84 0x41 # CHECK: mttr $4, $15, 1, 5, 0 diff --git a/test/MC/Disassembler/Mips/mt/valid-r2.txt b/test/MC/Disassembler/Mips/mt/valid-r2.txt new file mode 100644 index 
0000000000000..4786d8b5591f4 --- /dev/null +++ b/test/MC/Disassembler/Mips/mt/valid-r2.txt @@ -0,0 +1,32 @@ +# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips32r2 -mattr=+mt | FileCheck %s +0x41 0x60 0x0b 0xc1 # CHECK: dmt +0x41 0x65 0x0b 0xc1 # CHECK: dmt $5 +0x41 0x60 0x0b 0xe1 # CHECK: emt +0x41 0x64 0x0b 0xe1 # CHECK: emt $4 +0x41 0x60 0x00 0x01 # CHECK: dvpe +0x41 0x66 0x00 0x01 # CHECK: dvpe $6 +0x41 0x60 0x00 0x21 # CHECK: evpe +0x41 0x64 0x00 0x21 # CHECK: evpe $4 +0x7c 0x65 0x10 0x08 # CHECK: fork $2, $3, $5 +0x7c 0x80 0x00 0x09 # CHECK: yield $4 +0x7c 0xa0 0x20 0x09 # CHECK: yield $4, $5 +0x41 0x05 0x20 0x02 # CHECK: mftr $4, $5, 0, 2, 0 +0x41 0x05 0x20 0x20 # CHECK: mftr $4, $5, 1, 0, 0 +0x41 0x00 0x20 0x21 # CHECK: mftr $4, $zero, 1, 1, 0 +0x41 0x0a 0x20 0x21 # CHECK: mftr $4, $10, 1, 1, 0 +0x41 0x0a 0x20 0x22 # CHECK: mftr $4, $10, 1, 2, 0 +0x41 0x0a 0x20 0x32 # CHECK: mftr $4, $10, 1, 2, 1 +0x41 0x1a 0x20 0x23 # CHECK: mftr $4, $26, 1, 3, 0 +0x41 0x1f 0x20 0x23 # CHECK: mftr $4, $ra, 1, 3, 0 +0x41 0x0e 0x20 0x24 # CHECK: mftr $4, $14, 1, 4, 0 +0x41 0x0f 0x20 0x25 # CHECK: mftr $4, $15, 1, 5, 0 +0x41 0x84 0x28 0x02 # CHECK: mttr $4, $5, 0, 2, 0 +0x41 0x84 0x28 0x20 # CHECK: mttr $4, $5, 1, 0, 0 +0x41 0x84 0x00 0x21 # CHECK: mttr $4, $zero, 1, 1, 0 +0x41 0x84 0x50 0x21 # CHECK: mttr $4, $10, 1, 1, 0 +0x41 0x84 0x50 0x22 # CHECK: mttr $4, $10, 1, 2, 0 +0x41 0x84 0x50 0x32 # CHECK: mttr $4, $10, 1, 2, 1 +0x41 0x84 0xd0 0x23 # CHECK: mttr $4, $26, 1, 3, 0 +0x41 0x84 0xf8 0x23 # CHECK: mttr $4, $ra, 1, 3, 0 +0x41 0x84 0x70 0x24 # CHECK: mttr $4, $14, 1, 4, 0 +0x41 0x84 0x78 0x25 # CHECK: mttr $4, $15, 1, 5, 0 diff --git a/test/MC/ELF/bad-expr3.s b/test/MC/ELF/bad-expr3.s index 990167cda53f8..cf5d6f47335f5 100644 --- a/test/MC/ELF/bad-expr3.s +++ b/test/MC/ELF/bad-expr3.s @@ -1,8 +1,7 @@ // RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o /dev/null \ // RUN: 2>&1 | FileCheck %s -// CHECK: Cannot represent a difference across sections - +// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: Cannot represent a difference across sections .long foo - bar .section .zed foo: diff --git a/test/MC/Mips/addend.s b/test/MC/Mips/addend.s new file mode 100644 index 0000000000000..93ce4f413aebe --- /dev/null +++ b/test/MC/Mips/addend.s @@ -0,0 +1,21 @@ +# RUN: llvm-mc -filetype=obj -triple=mips-unknown-linux -mcpu=mips32r6 %s -o %t.o +# RUN: llvm-readobj -s -section-data %t.o | FileCheck %s + +# CHECK: Name: .text +# CHECK-NEXT: Type: +# CHECK-NEXT: Flags [ +# CHECK-NEXT: SHF_ALLOC +# CHECK-NEXT: SHF_EXECINSTR +# CHECK-NEXT: ] +# CHECK-NEXT: Address: +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: +# CHECK-NEXT: Link: +# CHECK-NEXT: Info: +# CHECK-NEXT: AddressAlignment: +# CHECK-NEXT: EntrySize: +# CHECK-NEXT: SectionData ( +# CHECK-NEXT: 0000: 00000008 | +# CHECK-NEXT: ) + + .word _foo+8-. 
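A note on the [[@LINE]] idiom that the diagnostics tests in this change (label-arithmetic-diags-elf.s, bad-expr.s, bad-expr3.s) now use: FileCheck expands [[@LINE+N]] and [[@LINE-N]] to the line number of the CHECK directive itself plus or minus N, so the expected "<line>:<col>: error:" prefix stays pinned to the offending source line even when the test file is later edited. A minimal sketch of the idiom as a hypothetical test file (the diagnostic text is the one checked by bad-expr3.s above; the section and symbol names are illustrative only):

    // RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o /dev/null \
    // RUN:     2>&1 | FileCheck %s
    // The [[@LINE+1]] below expands to the line number of the .long directive.
    // CHECK: [[@LINE+1]]:{{[0-9]+}}: error: Cannot represent a difference across sections
    .long foo - bar
    .section .zed
    foo:
    .section .bah
    bar: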
diff --git a/test/MC/Mips/mt/abiflag.s b/test/MC/Mips/mt/abiflag.s
new file mode 100644
index 0000000000000..b4769cba4c2d1
--- /dev/null
+++ b/test/MC/Mips/mt/abiflag.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -mattr=+mt -filetype=obj -o - \
+# RUN: | llvm-readobj -mips-abi-flags | FileCheck %s
+
+# Test that the usage of the MT ASE is recorded in .MIPS.abiflags
+
+# CHECK: ASEs
+# CHECK-NEXT: MT (0x40)
+
+ .text
+ nop
diff --git a/test/MC/Mips/mt/invalid-wrong-error.s b/test/MC/Mips/mt/invalid-wrong-error.s
new file mode 100644
index 0000000000000..0247089b70ae6
--- /dev/null
+++ b/test/MC/Mips/mt/invalid-wrong-error.s
@@ -0,0 +1,3 @@
+# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt < %s 2>&1 | FileCheck %s
+ mftr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list
+ mttr 0($4), $5, 0, 0, 0 # CHECK: error: unexpected token in argument list
diff --git a/test/MC/Mips/mt/invalid.s b/test/MC/Mips/mt/invalid.s
new file mode 100644
index 0000000000000..d4055c4a50f44
--- /dev/null
+++ b/test/MC/Mips/mt/invalid.s
@@ -0,0 +1,27 @@
+# RUN: not llvm-mc -arch=mips -mcpu=mips32 -mattr=+mt < %s 2>&1 | FileCheck %s
+ dmt 4 # CHECK: error: invalid operand for instruction
+ dmt $4, $5 # CHECK: error: invalid operand for instruction
+ dmt $5, 0($4) # CHECK: error: invalid operand for instruction
+ emt 4 # CHECK: error: invalid operand for instruction
+ emt $4, $5 # CHECK: error: invalid operand for instruction
+ emt $5, 0($5) # CHECK: error: invalid operand for instruction
+ dvpe 4 # CHECK: error: invalid operand for instruction
+ dvpe $4, $5 # CHECK: error: invalid operand for instruction
+ dvpe $5, 0($4) # CHECK: error: invalid operand for instruction
+ evpe 4 # CHECK: error: invalid operand for instruction
+ evpe $4, $5 # CHECK: error: invalid operand for instruction
+ evpe $5, 0($5) # CHECK: error: invalid operand for instruction
+ mftr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction
+ mftr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate
+ mftr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate
+ mftr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate
+ mftr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned immediate
+ mftr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate
+ mftr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate
+ mttr $4, 0($5), 0, 0, 0 # CHECK: error: invalid operand for instruction
+ mttr $4, $5, 2, 0, 0 # CHECK: error: expected 1-bit unsigned immediate
+ mttr $4, $5, -1, 0, 0 # CHECK: error: expected 1-bit unsigned immediate
+ mttr $4, $5, 0, 8, 0 # CHECK: error: expected 3-bit unsigned immediate
+ mttr $4, $5, 0, -1, 0 # CHECK: error: expected 3-bit unsigned immediate
+ mttr $4, $4, 0, 0, 2 # CHECK: error: expected 1-bit unsigned immediate
+ mttr $4, $5, 0, 0, -1 # CHECK: error: expected 1-bit unsigned immediate
diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s
new file mode 100644
index 0000000000000..4e872412e6ef2
--- /dev/null
+++ b/test/MC/Mips/mt/mftr-mttr-aliases-invalid-wrong-error.s
@@ -0,0 +1,18 @@
+# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \
+# RUN: 2>&1 | FileCheck %s
+
+# The integrated assembler produces a wrong or misleading error message.
+ + mftc0 0($4), $5 # CHECK: error: unexpected token in argument list + mftc0 0($4), $5, 1 # CHECK: error: unexpected token in argument list + mftgpr 0($4), $5 # CHECK: error: unexpected token in argument list + mftlo 0($3) # CHECK: error: unexpected token in argument list + mftlo 0($3), $ac1 # CHECK: error: unexpected token in argument list + mfthi 0($3) # CHECK: error: unexpected token in argument list + mfthi 0($3), $ac1 # CHECK: error: unexpected token in argument list + mftacx 0($3) # CHECK: error: unexpected token in argument list + mftacx 0($3), $ac1 # CHECK: error: unexpected token in argument list + mftdsp 0($4) # CHECK: error: unexpected token in argument list + mftc1 0($4), $f4 # CHECK: error: unexpected token in argument list + mfthc1 0($4), $f4 # CHECK: error: unexpected token in argument list + cftc1 0($4), $f8 # CHECK: error: unexpected token in argument list diff --git a/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s b/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s new file mode 100644 index 0000000000000..06ae8c72e654a --- /dev/null +++ b/test/MC/Mips/mt/mftr-mttr-aliases-invalid.s @@ -0,0 +1,23 @@ +# RUN: not llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ +# RUN: 2>&1 | FileCheck %s + + mftc0 $4, 0($5) # CHECK: error: invalid operand for instruction + mftc0 $4, 0($5), 1 # CHECK: error: invalid operand for instruction + mftc0 $4, $5, -1 # CHECK: error: expected 3-bit unsigned immediate + mftc0 $4, $5, 9 # CHECK: error: expected 3-bit unsigned immediate + mftc0 $4, $5, $6 # CHECK: error: expected 3-bit unsigned immediate + mftgpr $4, 0($5) # CHECK: error: invalid operand for instruction + mftgpr $4, $5, $6 # CHECK: error: invalid operand for instruction + mftlo $3, 0($ac1) # CHECK: error: invalid operand for instruction + mftlo $4, $ac1, $4 # CHECK: error: invalid operand for instruction + mfthi $3, 0($ac1) # CHECK: error: invalid operand for instruction + mfthi $4, $ac1, $4 # CHECK: error: invalid operand for instruction + mftacx $3, 0($ac1) # CHECK: error: invalid operand for instruction + mftacx $4, $ac1, $4 # CHECK: error: invalid operand for instruction + mftdsp $4, $5 # CHECK: error: invalid operand for instruction + mftdsp $4, $f5 # CHECK: error: invalid operand for instruction + mftdsp $4, $ac0 # CHECK: error: invalid operand for instruction + mftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction + mfthc1 $4, 0($f4) # CHECK: error: invalid operand for instruction + cftc1 $4, 0($f4) # CHECK: error: invalid operand for instruction + cftc1 $4, $f4, $5 # CHECK: error: invalid operand for instruction diff --git a/test/MC/Mips/mt/mftr-mttr-aliases.s b/test/MC/Mips/mt/mftr-mttr-aliases.s new file mode 100644 index 0000000000000..92ed9f9281f20 --- /dev/null +++ b/test/MC/Mips/mt/mftr-mttr-aliases.s @@ -0,0 +1,47 @@ +# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s + +# Check the various aliases of the m[ft]tr instruction. 
+ + mftc0 $4, $5 # CHECK: mftr $4, $5, 0, 0, 0 # encoding: [0x41,0x05,0x20,0x00] + mftc0 $6, $7, 1 # CHECK: mftr $6, $7, 0, 1, 0 # encoding: [0x41,0x07,0x30,0x01] + mftgpr $5, $9 # CHECK: mftr $5, $9, 1, 0, 0 # encoding: [0x41,0x09,0x28,0x20] + mftlo $3 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21] + mftlo $3, $ac0 # CHECK: mftr $3, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x18,0x21] + mftlo $3, $ac1 # CHECK: mftr $3, $4, 1, 1, 0 # encoding: [0x41,0x04,0x18,0x21] + mftlo $3, $ac2 # CHECK: mftr $3, $8, 1, 1, 0 # encoding: [0x41,0x08,0x18,0x21] + mftlo $3, $ac3 # CHECK: mftr $3, $12, 1, 1, 0 # encoding: [0x41,0x0c,0x18,0x21] + mfthi $3, $ac0 # CHECK: mftr $3, $1, 1, 1, 0 # encoding: [0x41,0x01,0x18,0x21] + mfthi $3, $ac1 # CHECK: mftr $3, $5, 1, 1, 0 # encoding: [0x41,0x05,0x18,0x21] + mfthi $3, $ac2 # CHECK: mftr $3, $9, 1, 1, 0 # encoding: [0x41,0x09,0x18,0x21] + mfthi $3, $ac3 # CHECK: mftr $3, $13, 1, 1, 0 # encoding: [0x41,0x0d,0x18,0x21] + mftacx $3, $ac0 # CHECK: mftr $3, $2, 1, 1, 0 # encoding: [0x41,0x02,0x18,0x21] + mftacx $3, $ac1 # CHECK: mftr $3, $6, 1, 1, 0 # encoding: [0x41,0x06,0x18,0x21] + mftacx $3, $ac2 # CHECK: mftr $3, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x18,0x21] + mftacx $3, $ac3 # CHECK: mftr $3, $14, 1, 1, 0 # encoding: [0x41,0x0e,0x18,0x21] + mftdsp $4 # CHECK: mftr $4, $16, 1, 1, 0 # encoding: [0x41,0x10,0x20,0x21] + mftc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 0 # encoding: [0x41,0x05,0x20,0x22] + mfthc1 $4, $f5 # CHECK: mftr $4, $5, 1, 2, 1 # encoding: [0x41,0x05,0x20,0x32] + cftc1 $4, $f9 # CHECK: mftr $4, $9, 1, 3, 0 # encoding: [0x41,0x09,0x20,0x23] + + mttc0 $4, $5 # CHECK: mttr $4, $5, 0, 0, 0 # encoding: [0x41,0x84,0x28,0x00] + mttc0 $6, $7, 1 # CHECK: mttr $6, $7, 0, 1, 0 # encoding: [0x41,0x86,0x38,0x01] + mttgpr $5, $9 # CHECK: mttr $5, $9, 1, 0, 0 # encoding: [0x41,0x85,0x48,0x20] + mttlo $3 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21] + mttlo $3, $ac0 # CHECK: mttr $3, $zero, 1, 1, 0 # encoding: [0x41,0x83,0x00,0x21] + mttlo $3, $ac1 # CHECK: mttr $3, $4, 1, 1, 0 # encoding: [0x41,0x83,0x20,0x21] + mttlo $3, $ac2 # CHECK: mttr $3, $8, 1, 1, 0 # encoding: [0x41,0x83,0x40,0x21] + mttlo $3, $ac3 # CHECK: mttr $3, $12, 1, 1, 0 # encoding: [0x41,0x83,0x60,0x21] + mtthi $3 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21] + mtthi $3, $ac0 # CHECK: mttr $3, $1, 1, 1, 0 # encoding: [0x41,0x83,0x08,0x21] + mtthi $3, $ac1 # CHECK: mttr $3, $5, 1, 1, 0 # encoding: [0x41,0x83,0x28,0x21] + mtthi $3, $ac2 # CHECK: mttr $3, $9, 1, 1, 0 # encoding: [0x41,0x83,0x48,0x21] + mtthi $3, $ac3 # CHECK: mttr $3, $13, 1, 1, 0 # encoding: [0x41,0x83,0x68,0x21] + mttacx $3 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21] + mttacx $3, $ac0 # CHECK: mttr $3, $2, 1, 1, 0 # encoding: [0x41,0x83,0x10,0x21] + mttacx $3, $ac1 # CHECK: mttr $3, $6, 1, 1, 0 # encoding: [0x41,0x83,0x30,0x21] + mttacx $3, $ac2 # CHECK: mttr $3, $10, 1, 1, 0 # encoding: [0x41,0x83,0x50,0x21] + mttacx $3, $ac3 # CHECK: mttr $3, $14, 1, 1, 0 # encoding: [0x41,0x83,0x70,0x21] + mttdsp $4 # CHECK: mttr $4, $16, 1, 1, 0 # encoding: [0x41,0x84,0x80,0x21] + mttc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 0 # encoding: [0x41,0x84,0x28,0x22] + mtthc1 $4, $f5 # CHECK: mttr $4, $5, 1, 2, 1 # encoding: [0x41,0x84,0x28,0x32] + cttc1 $4, $f9 # CHECK: mttr $4, $9, 1, 3, 0 # encoding: [0x41,0x84,0x48,0x23] diff --git a/test/MC/Mips/mt/mftr-mttr-reserved-valid.s b/test/MC/Mips/mt/mftr-mttr-reserved-valid.s new file mode 100644 index 0000000000000..c40e81bfc7d75 
--- /dev/null
+++ b/test/MC/Mips/mt/mftr-mttr-reserved-valid.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s | FileCheck %s
+
+# The selector value and register values here are marked as reserved in the
+# documentation, but GAS accepts them without warning.
+ mftr $31, $31, 1, 1, 0 # CHECK: mftr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x1f,0xf8,0x21]
+ mttr $31, $31, 1, 1, 0 # CHECK: mttr $ra, $ra, 1, 1, 0 # encoding: [0x41,0x9f,0xf8,0x21]
+ mftr $31, $13, 1, 6, 0 # CHECK: mftr $ra, $13, 1, 6, 0 # encoding: [0x41,0x0d,0xf8,0x26]
+ mttr $31, $13, 1, 6, 0 # CHECK: mttr $ra, $13, 1, 6, 0 # encoding: [0x41,0x9f,0x68,0x26]
diff --git a/test/MC/Mips/mt/module-directive-invalid.s b/test/MC/Mips/mt/module-directive-invalid.s
new file mode 100644
index 0000000000000..38baaa07cdc17
--- /dev/null
+++ b/test/MC/Mips/mt/module-directive-invalid.s
@@ -0,0 +1,6 @@
+# RUN: not llvm-mc -arch=mips -mcpu=mips32r5 < %s 2>&1 | FileCheck %s
+
+# CHECK: error: .module directive must appear before any code
+ .set nomips16
+ .module mt
+ nop
diff --git a/test/MC/Mips/mt/module-directive.s b/test/MC/Mips/mt/module-directive.s
new file mode 100644
index 0000000000000..d316f054eaae3
--- /dev/null
+++ b/test/MC/Mips/mt/module-directive.s
@@ -0,0 +1,16 @@
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \
+# RUN:   llvm-readobj -mips-abi-flags | FileCheck --check-prefix=CHECK-OBJ %s
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \
+# RUN:   FileCheck --check-prefix=CHECK-ASM %s
+
+# Test that the .module directive sets the MT flag in .MIPS.abiflags when
+# assembling to object files.
+
+# Test that the .module directive is re-emitted when expanding assembly.
+
+# CHECK-OBJ: ASEs
+# CHECK-OBJ-NEXT: MT (0x40)
+
+# CHECK-ASM: .module mt
+.module mt
+nop
diff --git a/test/MC/Mips/mt/set-directive.s b/test/MC/Mips/mt/set-directive.s
new file mode 100644
index 0000000000000..53ed4b273795d
--- /dev/null
+++ b/test/MC/Mips/mt/set-directive.s
@@ -0,0 +1,14 @@
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \
+# RUN:   llvm-readobj -mips-abi-flags | FileCheck %s --check-prefix=CHECK-OBJ
+# RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \
+# RUN:   FileCheck %s --check-prefix=CHECK-ASM
+
+# Test that the MT ASE flag in .MIPS.abiflags is _not_ set by .set.
+# Test that '.set mt' is emitted by the asm target streamer.
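+#
+# A sketch of the contrast with module-directive.s above (a summary of what
+# the two tests check, not text from the commit itself): '.module mt' is a
+# module-level statement and so is recorded in the object file, while
+# '.set mt' only changes assembler state from that point onwards:
+#
+#   .module mt   # .MIPS.abiflags ASEs gains MT (0x40)
+#   .set mt      # MT enabled for following code; .MIPS.abiflags untouched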
+ +# CHECK-OBJ: ASEs +# CHECK-OBJ-NOT: MT (0x40) + +# CHECK-ASM: .set mt + .set mt + nop diff --git a/test/MC/Mips/mt/valid.s b/test/MC/Mips/mt/valid.s new file mode 100644 index 0000000000000..9fa07870a61f2 --- /dev/null +++ b/test/MC/Mips/mt/valid.s @@ -0,0 +1,33 @@ +# RUN: llvm-mc -arch=mips -mcpu=mips32r2 -mattr=+mt -show-encoding < %s \ +# RUN: | FileCheck %s + dmt # CHECK: dmt # encoding: [0x41,0x60,0x0b,0xc1] + dmt $5 # CHECK: dmt $5 # encoding: [0x41,0x65,0x0b,0xc1] + emt # CHECK: emt # encoding: [0x41,0x60,0x0b,0xe1] + emt $4 # CHECK: emt $4 # encoding: [0x41,0x64,0x0b,0xe1] + dvpe # CHECK: dvpe # encoding: [0x41,0x60,0x00,0x01] + dvpe $6 # CHECK: dvpe $6 # encoding: [0x41,0x66,0x00,0x01] + evpe # CHECK: evpe # encoding: [0x41,0x60,0x00,0x21] + evpe $4 # CHECK: evpe $4 # encoding: [0x41,0x64,0x00,0x21] + fork $2, $3, $5 # CHECK: fork $2, $3, $5 # encoding: [0x7c,0x65,0x10,0x08] + yield $4 # CHECK: yield $4 # encoding: [0x7c,0x80,0x00,0x09] + yield $4, $5 # CHECK: yield $4, $5 # encoding: [0x7c,0xa0,0x20,0x09] + mftr $4, $5, 0, 2, 0 # CHECK: mftr $4, $5, 0, 2, 0 # encoding: [0x41,0x05,0x20,0x02] + mftr $4, $5, 1, 0, 0 # CHECK: mftr $4, $5, 1, 0, 0 # encoding: [0x41,0x05,0x20,0x20] + mftr $4, $0, 1, 1, 0 # CHECK: mftr $4, $zero, 1, 1, 0 # encoding: [0x41,0x00,0x20,0x21] + mftr $4, $10, 1, 1, 0 # CHECK: mftr $4, $10, 1, 1, 0 # encoding: [0x41,0x0a,0x20,0x21] + mftr $4, $10, 1, 2, 0 # CHECK: mftr $4, $10, 1, 2, 0 # encoding: [0x41,0x0a,0x20,0x22] + mftr $4, $10, 1, 2, 1 # CHECK: mftr $4, $10, 1, 2, 1 # encoding: [0x41,0x0a,0x20,0x32] + mftr $4, $26, 1, 3, 0 # CHECK: mftr $4, $26, 1, 3, 0 # encoding: [0x41,0x1a,0x20,0x23] + mftr $4, $31, 1, 3, 0 # CHECK: mftr $4, $ra, 1, 3, 0 # encoding: [0x41,0x1f,0x20,0x23] + mftr $4, $14, 1, 4, 0 # CHECK: mftr $4, $14, 1, 4, 0 # encoding: [0x41,0x0e,0x20,0x24] + mftr $4, $15, 1, 5, 0 # CHECK: mftr $4, $15, 1, 5, 0 # encoding: [0x41,0x0f,0x20,0x25] + mttr $4, $5, 0, 2, 0 # CHECK: mttr $4, $5, 0, 2, 0 # encoding: [0x41,0x84,0x28,0x02] + mttr $4, $5, 1, 0, 0 # CHECK: mttr $4, $5, 1, 0, 0 # encoding: [0x41,0x84,0x28,0x20] + mttr $4, $0, 1, 1, 0 # CHECK: mttr $4, $zero, 1, 1, 0 # encoding: [0x41,0x84,0x00,0x21] + mttr $4, $10, 1, 1, 0 # CHECK: mttr $4, $10, 1, 1, 0 # encoding: [0x41,0x84,0x50,0x21] + mttr $4, $10, 1, 2, 0 # CHECK: mttr $4, $10, 1, 2, 0 # encoding: [0x41,0x84,0x50,0x22] + mttr $4, $10, 1, 2, 1 # CHECK: mttr $4, $10, 1, 2, 1 # encoding: [0x41,0x84,0x50,0x32] + mttr $4, $26, 1, 3, 0 # CHECK: mttr $4, $26, 1, 3, 0 # encoding: [0x41,0x84,0xd0,0x23] + mttr $4, $31, 1, 3, 0 # CHECK: mttr $4, $ra, 1, 3, 0 # encoding: [0x41,0x84,0xf8,0x23] + mttr $4, $14, 1, 4, 0 # CHECK: mttr $4, $14, 1, 4, 0 # encoding: [0x41,0x84,0x70,0x24] + mttr $4, $15, 1, 5, 0 # CHECK: mttr $4, $15, 1, 5, 0 # encoding: [0x41,0x84,0x78,0x25] diff --git a/test/MC/WebAssembly/array-fill.ll b/test/MC/WebAssembly/array-fill.ll new file mode 100644 index 0000000000000..4feabc0748e0f --- /dev/null +++ b/test/MC/WebAssembly/array-fill.ll @@ -0,0 +1,14 @@ +; RUN: llc -filetype=obj %s -o - | obj2yaml | FileCheck %s +; PR33624 + +source_filename = "ws.c" +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown-wasm" + +%struct.bd = type { i8 } + +@gBd = hidden global [2 x %struct.bd] [%struct.bd { i8 1 }, %struct.bd { i8 2 }], align 1 + +; CHECK: - Type: DATA +; CHECK: Content: '0102' +; CHECK: DataSize: 2 diff --git a/test/MC/WebAssembly/external-data.ll b/test/MC/WebAssembly/external-data.ll index 6914736ac671a..b8c97453413e2 100644 --- 
a/test/MC/WebAssembly/external-data.ll +++ b/test/MC/WebAssembly/external-data.ll @@ -13,7 +13,8 @@ ; CHECK: Index: 0 ; CHECK: Offset: 0x0000000E ; CHECK: Segments: -; CHECK: - Index: 0 +; CHECK: - SectionOffset: 6 +; CHECK: MemoryIndex: 0 ; CHECK: Offset: ; CHECK: Opcode: I32_CONST ; CHECK: Value: 0 diff --git a/test/MC/WebAssembly/external-func-address.ll b/test/MC/WebAssembly/external-func-address.ll index 4022b2c9bae97..53da9805f9871 100644 --- a/test/MC/WebAssembly/external-func-address.ll +++ b/test/MC/WebAssembly/external-func-address.ll @@ -2,24 +2,33 @@ ; Verify that addresses of external functions generate correctly typed ; imports and relocations or type R_TABLE_INDEX_I32. -declare void @f1() #1 -@ptr_to_f1 = hidden global void ()* @f1, align 4 +declare void @f1(i32) #1 +@ptr_to_f1 = hidden global void (i32)* @f1, align 4 - -; CHECK: - Type: IMPORT -; CHECK: Imports: -; CHECK: - Module: env -; CHECK: Field: f1 -; CHECK: Kind: FUNCTION -; CHECK: SigIndex: 0 -; CHECK: - Type: ELEM -; CHECK: Segments: -; CHECK: - Offset: -; CHECK: Opcode: I32_CONST -; CHECK: Value: 0 -; CHECK: Functions: [ 0 ] -; CHECK: - Type: DATA -; CHECK: Relocations: -; CHECK: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32 -; CHECK: Index: 0 -; CHECK: Offset: 0x00000006 +; CHECK: --- !WASM +; CHECK-NEXT: FileHeader: +; CHECK-NEXT: Version: 0x00000001 +; CHECK-NEXT: Sections: +; CHECK-NEXT: - Type: TYPE +; CHECK-NEXT: Signatures: +; CHECK-NEXT: - Index: 0 +; CHECK-NEXT: ReturnType: NORESULT +; CHECK-NEXT: ParamTypes: +; CHECK-NEXT: - I32 +; CHECK: - Type: IMPORT +; CHECK-NEXT: Imports: +; CHECK-NEXT: - Module: env +; CHECK-NEXT: Field: f1 +; CHECK-NEXT: Kind: FUNCTION +; CHECK-NEXT: SigIndex: 0 +; CHECK: - Type: ELEM +; CHECK-NEXT: Segments: +; CHECK-NEXT: - Offset: +; CHECK-NEXT: Opcode: I32_CONST +; CHECK-NEXT: Value: 0 +; CHECK-NEXT: Functions: [ 0 ] +; CHECK: - Type: DATA +; CHECK-NEXT: Relocations: +; CHECK-NEXT: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32 +; CHECK-NEXT: Index: 0 +; CHECK-NEXT: Offset: 0x00000006 diff --git a/test/MC/WebAssembly/unnamed-data.ll b/test/MC/WebAssembly/unnamed-data.ll index fd985088c1d27..fa0ff966a79fd 100644 --- a/test/MC/WebAssembly/unnamed-data.ll +++ b/test/MC/WebAssembly/unnamed-data.ll @@ -46,7 +46,8 @@ ; CHECK-NEXT: Index: 1 ; CHECK-NEXT: Offset: 0x0000001E ; CHECK-NEXT: Segments: -; CHECK-NEXT: - Index: 0 +; CHECK-NEXT: - SectionOffset: 6 +; CHECK-NEXT: MemoryIndex: 0 ; CHECK-NEXT: Offset: ; CHECK-NEXT: Opcode: I32_CONST ; CHECK-NEXT: Value: 0 diff --git a/test/MC/WebAssembly/weak-alias.ll b/test/MC/WebAssembly/weak-alias.ll index 6e2b8631d2b17..1d80ea4aac6c1 100644 --- a/test/MC/WebAssembly/weak-alias.ll +++ b/test/MC/WebAssembly/weak-alias.ll @@ -3,27 +3,56 @@ ; foo_alias() function is weak alias of function foo() ; Generates two exports of the same function, one of them weak -@foo_alias = weak hidden alias i32 (...), bitcast (i32 ()* @foo to i32 (...)*) +@foo_alias = weak hidden alias i32 (), i32 ()* @foo + +define hidden i32 @call_alias() #0 { +entry: + %call = call i32 @foo_alias() + ret i32 %call +} define hidden i32 @foo() #0 { entry: ret i32 0 } + +; CHECK: - Type: TYPE +; CHECK-NEXT: Signatures: +; CHECK-NEXT: - Index: 0 +; CHECK-NEXT: ReturnType: I32 +; CHECK-NEXT: ParamTypes: + +; CHECK: - Type: IMPORT +; CHECK-NEXT: Imports: +; CHECK-NEXT: - Module: env +; CHECK-NEXT: Field: foo_alias +; CHECK-NEXT: Kind: FUNCTION +; CHECK-NEXT: SigIndex: 0 + +; CHECK: - Type: FUNCTION +; CHECK-NEXT: FunctionTypes: [ 0, 0 ] + ; CHECK: - Type: EXPORT ; CHECK-NEXT: Exports: +; CHECK-NEXT: 
- Name: call_alias
+; CHECK-NEXT: Kind: FUNCTION
+; CHECK-NEXT: Index: 1
; CHECK-NEXT: - Name: foo
; CHECK-NEXT: Kind: FUNCTION
-; CHECK-NEXT: Index: 0
+; CHECK-NEXT: Index: 2
; CHECK-NEXT: - Name: foo_alias
; CHECK-NEXT: Kind: FUNCTION
-; CHECK-NEXT: Index: 0
-
+; CHECK-NEXT: Index: 2
; CHECK: - Type: CUSTOM
; CHECK-NEXT: Name: name
; CHECK-NEXT: FunctionNames:
; CHECK-NEXT: - Index: 0
+; CHECK-NEXT: Name: foo_alias
+; CHECK-NEXT: - Index: 1
+; CHECK-NEXT: Name: call_alias
+; CHECK-NEXT: - Index: 2
; CHECK-NEXT: Name: foo
; CHECK-NEXT: - Type: CUSTOM
; CHECK-NEXT: Name: linking
diff --git a/test/Object/Inputs/trivial-object-test.wasm b/test/Object/Inputs/trivial-object-test.wasm
Binary files differ
new file mode 100644
index 0000000000000..1f3947ac472e0
--- /dev/null
+++ b/test/Object/Inputs/trivial-object-test.wasm
diff --git a/test/Object/Inputs/trivial.ll b/test/Object/Inputs/trivial.ll
index 37a6bc20a8c2c..528a713c7fa3c 100644
--- a/test/Object/Inputs/trivial.ll
+++ b/test/Object/Inputs/trivial.ll
@@ -1,3 +1,6 @@
+; Input used for generating checked-in binaries (trivial-object-test.*)
+; llc -mtriple=wasm32-unknown-unknown-wasm trivial.ll -filetype=obj -o trivial-object-test.wasm
+
 @.str = private unnamed_addr constant [13 x i8] c"Hello World\0A\00", align 1
 
 define i32 @main() nounwind {
diff --git a/test/Object/nm-trivial-object.test b/test/Object/nm-trivial-object.test
index c1f4d9e1f96ff..f1aadd5cccf59 100644
--- a/test/Object/nm-trivial-object.test
+++ b/test/Object/nm-trivial-object.test
@@ -2,6 +2,8 @@ RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-nm -a -S - \
 RUN:         | FileCheck %s -check-prefix COFF32
RUN: yaml2obj %p/Inputs/COFF/x86-64.yaml | llvm-nm -a -S - \
RUN:         | FileCheck %s -check-prefix COFF64
+RUN: llvm-nm %p/Inputs/trivial-object-test.wasm \
+RUN:         | FileCheck %s -check-prefix WASM
RUN: llvm-nm %p/Inputs/trivial-object-test.elf-i386 \
RUN:         | FileCheck %s -check-prefix ELF
RUN: llvm-nm %p/Inputs/trivial-object-test.elf-i386 -S \
@@ -57,6 +59,11 @@ COFF32-NEXT: U _SomeOtherFunction
COFF32-NEXT: 00000000 T _main
COFF32-NEXT: U _puts
+WASM: U SomeOtherFunction
+WASM-NEXT: 00000002 T main
+WASM-NEXT: U puts
+WASM-NEXT: 00000001 D var
+
COFF64: 00000000 d .data
COFF64-NEXT: 00000000 t .text
COFF64-NEXT: 00000000 r ??__Ex@@YAXXZ
diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index b89311db60697..73d466cc4993e 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test
@@ -4,8 +4,8 @@ RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mipsel | FileCheck %s --check-pr
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mips64el | FileCheck %s --check-prefix ELF-MIPS64EL
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-x86-64 | FileCheck %s --check-prefix ELF-X86-64
RUN: obj2yaml %p/Inputs/trivial-object-test.elf-avr | FileCheck %s --check-prefix ELF-AVR
-RUN: obj2yaml %p/Inputs/unwind-section.elf-x86-64 \
-RUN:   | FileCheck %s --check-prefix ELF-X86-64-UNWIND
+RUN: obj2yaml %p/Inputs/trivial-object-test.wasm | FileCheck %s --check-prefix WASM
+RUN: obj2yaml %p/Inputs/unwind-section.elf-x86-64 | FileCheck %s --check-prefix ELF-X86-64-UNWIND

COFF-I386: header:
COFF-I386-NEXT: Machine: IMAGE_FILE_MACHINE_I386
@@ -411,13 +411,13 @@ ELF-X86-64-NEXT: - Name: SomeOtherFunction
ELF-X86-64-NEXT: - Name: puts
-ELF-AVR: FileHeader: 
+ELF-AVR: FileHeader:
ELF-AVR-NEXT: Class: ELFCLASS32
ELF-AVR-NEXT: Data: ELFDATA2LSB
ELF-AVR-NEXT: Type: ET_EXEC
ELF-AVR-NEXT: Machine: EM_AVR
ELF-AVR-NEXT: Flags: [ EF_AVR_ARCH_AVR2 ]
-ELF-AVR-NEXT: Sections: 
ELF-AVR-NEXT: - Name: .text ELF-AVR-NEXT: Type: SHT_PROGBITS ELF-AVR-NEXT: Flags: [ SHF_ALLOC, SHF_EXECINSTR ] @@ -429,8 +429,8 @@ ELF-AVR-NEXT: Flags: [ SHF_WRITE, SHF_ALLOC ] ELF-AVR-NEXT: Address: 0x0000000000800060 ELF-AVR-NEXT: AddressAlign: 0x0000000000000001 ELF-AVR-NEXT: Content: '' -ELF-AVR-NEXT: Symbols: -ELF-AVR-NEXT: Local: +ELF-AVR-NEXT: Symbols: +ELF-AVR-NEXT: Local: ELF-AVR-NEXT: - Type: STT_SECTION ELF-AVR-NEXT: Section: .text ELF-AVR-NEXT: - Type: STT_SECTION @@ -440,7 +440,7 @@ ELF-AVR-NEXT: - Name: a.o ELF-AVR-NEXT: Type: STT_FILE ELF-AVR-NEXT: - Name: main ELF-AVR-NEXT: Section: .text -ELF-AVR-NEXT: Global: +ELF-AVR-NEXT: Global: ELF-AVR-NEXT: - Name: __trampolines_start ELF-AVR-NEXT: Section: .text ELF-AVR-NEXT: - Name: _etext @@ -470,6 +470,17 @@ ELF-AVR-NEXT: - Name: _end ELF-AVR-NEXT: Section: .data ELF-AVR-NEXT: Value: 0x0000000000800060 +WASM: --- !WASM +WASM-NEXT: FileHeader: +WASM-NEXT: Version: 0x00000001 +WASM: - Type: EXPORT +WASM-NEXT: Exports: +WASM-NEXT: - Name: main +WASM-NEXT: Kind: FUNCTION +WASM-NEXT: Index: 2 +WASM-NEXT: - Name: var +WASM-NEXT: Kind: GLOBAL +WASM-NEXT: Index: 1 ELF-X86-64-UNWIND: - Name: .eh_frame ELF-X86-64-UNWIND-NEXT: Type: SHT_X86_64_UNWIND diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test index 1e41f78ca729e..29f0019628758 100644 --- a/test/Object/objdump-relocations.test +++ b/test/Object/objdump-relocations.test @@ -12,6 +12,8 @@ RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-mips64el \ RUN: | FileCheck %s -check-prefix ELF-MIPS64EL RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-mipsel \ RUN: | FileCheck %s -check-prefix ELF-MIPSEL +RUN: llvm-objdump -r %p/Inputs/trivial-object-test.wasm \ +RUN: | FileCheck %s -check-prefix WASM RUN: llvm-objdump -r %p/Inputs/relocations.elf-x86-64 \ RUN: | FileCheck %s -check-prefix ELF-complex-x86-64 @@ -57,6 +59,11 @@ ELF-MIPSEL: R_MIPS_LO16 $.str ELF-MIPSEL: R_MIPS_CALL16 puts ELF-MIPSEL: R_MIPS_CALL16 SomeOtherFunction +WASM: CODE +WASM-NEXT: R_WEBASSEMBLY_GLOBAL_ADDR_SLEB 0+0 +WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB 0+0 +WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB 1+0 + ELF-complex-x86-64: .text ELF-complex-x86-64-NEXT: R_X86_64_8 .data-4 ELF-complex-x86-64-NEXT: R_X86_64_16 .data-4 diff --git a/test/ObjectYAML/wasm/data_section.yaml b/test/ObjectYAML/wasm/data_section.yaml index b8c65abbff912..521aa54027841 100644 --- a/test/ObjectYAML/wasm/data_section.yaml +++ b/test/ObjectYAML/wasm/data_section.yaml @@ -8,7 +8,7 @@ Sections: - Initial: 0x00000003 - Type: DATA Segments: - - Index: 0 + - MemoryIndex: 0 Offset: Opcode: I32_CONST Value: 4 @@ -38,7 +38,8 @@ Sections: # CHECK-NEXT: Offset: 0x00000006 # CHECK-NEXT: Addend: -6 # CHECK-NEXT: Segments: -# CHECK-NEXT: - Index: 0 +# CHECK-NEXT: - SectionOffset: 6 +# CHECK-NEXT: MemoryIndex: 0 # CHECK-NEXT: Offset: # CHECK-NEXT: Opcode: I32_CONST # CHECK-NEXT: Value: 4 diff --git a/test/Other/2002-01-31-CallGraph.ll b/test/Other/2002-01-31-CallGraph.ll index 0e4c877512631..d4819357ac67e 100644 --- a/test/Other/2002-01-31-CallGraph.ll +++ b/test/Other/2002-01-31-CallGraph.ll @@ -1,6 +1,6 @@ ; Call graph construction crash: Not handling indirect calls right ; -; RUN: opt < %s -analyze -print-callgraph >& /dev/null +; RUN: opt < %s -analyze -print-callgraph > /dev/null 2>&1 ; %FunTy = type i32 (i32) diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll index fbecb34aa4b7c..a0658c10d6098 100644 --- a/test/Other/new-pm-defaults.ll +++ b/test/Other/new-pm-defaults.ll @@ 
-26,6 +26,37 @@ ; RUN: -passes='lto-pre-link<O2>' -S %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2 +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes-ep-peephole='no-op-function' \ +; RUN: -passes='default<O3>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \ +; RUN: --check-prefix=CHECK-EP-PEEPHOLE +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes-ep-late-loop-optimizations='no-op-loop' \ +; RUN: -passes='default<O3>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \ +; RUN: --check-prefix=CHECK-EP-LOOP-LATE +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes-ep-loop-optimizer-end='no-op-loop' \ +; RUN: -passes='default<O3>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \ +; RUN: --check-prefix=CHECK-EP-LOOP-END +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes-ep-scalar-optimizer-late='no-op-function' \ +; RUN: -passes='default<O3>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \ +; RUN: --check-prefix=CHECK-EP-SCALAR-LATE +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes-ep-cgscc-optimizer-late='no-op-cgscc' \ +; RUN: -passes='default<O3>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \ +; RUN: --check-prefix=CHECK-EP-CGSCC-LATE +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes-ep-vectorizer-start='no-op-function' \ +; RUN: -passes='default<O3>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \ +; RUN: --check-prefix=CHECK-EP-VECTORIZER-START + ; CHECK-O: Starting llvm::Module pass manager run. ; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> ; CHECK-O-NEXT: Starting llvm::Module pass manager run. @@ -53,6 +84,7 @@ ; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> ; CHECK-O-NEXT: Starting llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA @@ -84,6 +116,7 @@ ; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass ; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass ; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass +; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Running pass: TailCallElimPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: ReassociatePass @@ -105,8 +138,10 @@ ; CHECK-O-NEXT: Starting Loop pass manager run. ; CHECK-O-NEXT: Running pass: IndVarSimplifyPass ; CHECK-O-NEXT: Running pass: LoopIdiomRecognizePass +; CHECK-EP-LOOP-LATE-NEXT: Running pass: NoOpLoopPass ; CHECK-O-NEXT: Running pass: LoopDeletionPass ; CHECK-O-NEXT: Running pass: LoopUnrollPass +; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass ; CHECK-O-NEXT: Finished Loop pass manager run. 
; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass ; CHECK-Os-NEXT: Running pass: GVN @@ -126,15 +161,19 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Running pass: JumpThreadingPass ; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O-NEXT: Running pass: DSEPass ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}> +; CHECK-EP-SCALAR-LATE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Running pass: ADCEPass ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. +; CHECK-EP-CGSCC-LATE-NEXT: Running pass: NoOpCGSCCPass ; CHECK-O-NEXT: Finished CGSCC pass manager run. ; CHECK-O-NEXT: Finished llvm::Module pass manager run. ; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> @@ -146,6 +185,7 @@ ; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> ; CHECK-O-NEXT: Starting llvm::Function pass manager run. ; CHECK-O-NEXT: Running pass: Float2IntPass +; CHECK-EP-VECTORIZER-START-NEXT: Running pass: NoOpFunctionPass ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: LoopVectorizePass diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll index dfd2983532729..cab3965bf18fd 100644 --- a/test/Other/new-pm-lto-defaults.ll +++ b/test/Other/new-pm-lto-defaults.ll @@ -17,6 +17,10 @@ ; RUN: opt -disable-verify -debug-pass-manager \ ; RUN: -passes='lto<Oz>' -S %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2 +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='lto<O3>' -S %s -passes-ep-peephole='no-op-function' 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2 \ +; RUN: --check-prefix=CHECK-EP-Peephole ; CHECK-O: Starting llvm::Module pass manager run. ; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module @@ -45,13 +49,18 @@ ; CHECK-O2-NEXT: Running analysis: AssumptionAnalysis ; CHECK-O2-NEXT: Running pass: ConstantMergePass ; CHECK-O2-NEXT: Running pass: DeadArgumentEliminationPass -; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}InstCombinePass> +; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> +; CHECK-O2-NEXT: Starting llvm::Function pass manager run. +; CHECK-O2-NEXT: Running pass: InstCombinePass +; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass +; CHECK-O2-NEXT: Finished llvm::Function pass manager run. ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass> ; CHECK-O2-NEXT: Running pass: GlobalOptPass ; CHECK-O2-NEXT: Running pass: GlobalDCEPass ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> ; CHECK-O2-NEXT: Starting llvm::Function pass manager run. 
; CHECK-O2-NEXT: Running pass: InstCombinePass
+; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass
; CHECK-O2-NEXT: Running pass: JumpThreadingPass
; CHECK-O2-NEXT: Running analysis: LazyValueAnalysis
; CHECK-O2-NEXT: Running pass: SROA on foo
diff --git a/test/Other/pass-pipelines.ll b/test/Other/pass-pipelines.ll
index 971ed2c094730..d47c02ee7a469 100644
--- a/test/Other/pass-pipelines.ll
+++ b/test/Other/pass-pipelines.ll
@@ -24,7 +24,7 @@
; CHECK-O2: Dead Argument Elimination
; CHECK-O2-NEXT: FunctionPass Manager
; CHECK-O2-NOT: Manager
-; Very carefully asert the CGSCC pass pipeline as it is fragile and unusually
+; Very carefully assert the CGSCC pass pipeline as it is fragile and unusually
; susceptible to phase ordering issues.
; CHECK-O2: CallGraph Construction
; CHECK-O2-NEXT: Globals Alias Analysis
diff --git a/test/SafepointIRVerifier/basic-use-after-reloc.ll b/test/SafepointIRVerifier/basic-use-after-reloc.ll
new file mode 100644
index 0000000000000..4b0746c9f5275
--- /dev/null
+++ b/test/SafepointIRVerifier/basic-use-after-reloc.ll
@@ -0,0 +1,23 @@
+; RUN: opt -safepoint-ir-verifier-print-only -verify-safepoint-ir -S %s 2>&1 | FileCheck %s
+
+; This test checks that if a value is used immediately after a
+; safepoint, without using the relocated value, the verifier
+; catches it.
+
+%jObject = type { [8 x i8] }
+
+; Function Attrs: nounwind
+define %jObject addrspace(1)* @test(%jObject addrspace(1)* %arg) gc "statepoint-example" {
+bci_0:
+  %safepoint_token3 = tail call token (i64, i32, double (double)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f64f64f(i64 0, i32 0, double (double)* undef, i32 1, i32 0, double undef, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, %jObject addrspace(1)* %arg)
+  %arg2.relocated4 = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token3, i32 13, i32 13)
+  ret %jObject addrspace(1)* %arg
+; CHECK: Illegal use of unrelocated value found!
+; CHECK-NEXT: Def: %jObject addrspace(1)* %arg
+; CHECK-NEXT: Use: ret %jObject addrspace(1)* %arg
+}
+
+; Function Attrs: nounwind
+declare %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token, i32, i32) #3
+
+declare token @llvm.experimental.gc.statepoint.p0f_f64f64f(i64, i32, double (double)*, i32, i32, ...)
diff --git a/test/SafepointIRVerifier/compares.ll b/test/SafepointIRVerifier/compares.ll
new file mode 100644
index 0000000000000..a14fc44e9814c
--- /dev/null
+++ b/test/SafepointIRVerifier/compares.ll
@@ -0,0 +1,85 @@
+; RUN: opt -safepoint-ir-verifier-print-only -verify-safepoint-ir -S %s 2>&1 | FileCheck %s
+
+; In some cases, it is valid to have unrelocated pointers used as compare
+; operands. Make sure the verifier knows to spot these exceptions.
+
+
+; comparison against null.
+define i8 addrspace(1)* @test1(i64 %arg, i8 addrspace(1)* %addr) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test1
+entry:
+  %load_addr = getelementptr i8, i8 addrspace(1)* %addr, i64 %arg
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %cmp = icmp eq i8 addrspace(1)* %load_addr, null
+  ret i8 addrspace(1)* null
+}
+
+; comparison against exclusively derived null.
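+; (A note on the rule exercised by test2 and test4, inferred from their
+; CHECK lines rather than stated anywhere in the commit: a pointer is
+; "exclusively derived from null" when every base it can be traced back to
+; is null, e.g. a gep off null, or a select/phi all of whose inputs are
+; themselves null-derived. Such a value can never point into the collected
+; heap, so comparing it across a safepoint is benign:
+;
+;   %p = getelementptr i8, i8 addrspace(1)* null, i64 %off  ; null-derived
+;   %c = icmp eq i8 addrspace(1)* %q, %p                    ; ok even after
+;                                                           ; a statepoint)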
+define void @test2(i64 %arg, i1 %cond, i8 addrspace(1)* %addr) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test2
+  %load_addr = getelementptr i8, i8 addrspace(1)* null, i64 %arg
+  %load_addr_sel = select i1 %cond, i8 addrspace(1)* null, i8 addrspace(1)* %load_addr
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %cmp = icmp eq i8 addrspace(1)* %addr, %load_addr_sel
+  ret void
+}
+
+; comparison against a constant non-null pointer. This is an unrelocated use,
+; since the pointer's bits may mean something in a VM.
+define void @test3(i64 %arg, i32 addrspace(1)* %addr) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test3
+; CHECK: Illegal use of unrelocated value found!
+entry:
+  %load_addr = getelementptr i32, i32 addrspace(1)* %addr, i64 %arg
+  %load_addr_const = getelementptr i32, i32 addrspace(1)* inttoptr (i64 15 to i32 addrspace(1)*), i64 %arg
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %cmp = icmp eq i32 addrspace(1)* %load_addr, %load_addr_const
+  ret void
+}
+
+; comparison against a derived pointer that is *not* exclusively derived from
+; null. An unrelocated use, since the derived pointer could be from the
+; constant non-null pointer (load_addr.2).
+define void @test4(i64 %arg, i1 %cond, i8 addrspace(1)* %base) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test4
+; CHECK: Illegal use of unrelocated value found!
+entry:
+  %load_addr.1 = getelementptr i8, i8 addrspace(1)* null, i64 %arg
+  br i1 %cond, label %split, label %join
+
+split:
+  %load_addr.2 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 30 to i8 addrspace(1)*), i64 %arg
+  br label %join
+
+join:
+  %load_addr = phi i8 addrspace(1)* [%load_addr.1, %entry], [%load_addr.2, %split]
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %cmp = icmp eq i8 addrspace(1)* %load_addr, %base
+  ret void
+}
+
+; comparison between two unrelocated base pointers.
+; Since the cmp can legally be reordered before the safepoint, these are
+; correct unrelocated uses of the pointers.
+define void @test5(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" {
+; CHECK: No illegal uses found by SafepointIRVerifier in: test5
+  %load_addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg
+  %load_addr2 = getelementptr i8, i8 addrspace(1)* %base2, i64 %arg
+  %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0)
+  %cmp = icmp eq i8 addrspace(1)* %load_addr1, %load_addr2
+  ret void
+}
+
+; comparison between a relocated and an unrelocated pointer.
+; This is an invalid use of the unrelocated pointer.
+define void @test6(i64 %arg, i8 addrspace(1)* %base1, i8 addrspace(1)* %base2) gc "statepoint-example" {
+; CHECK-LABEL: Verifying gc pointers in function: test6
+; CHECK: Illegal use of unrelocated value found!
+ %load_addr1 = getelementptr i8, i8 addrspace(1)* %base1, i64 %arg + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %base2 , i32 -1, i32 0, i32 0, i32 0) + %ptr2.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %safepoint_token, i32 7, i32 7) ; base2, base2 + %cmp = icmp eq i8 addrspace(1)* %load_addr1, %ptr2.relocated + ret void +} +declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) +declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) diff --git a/test/SafepointIRVerifier/constant-bases.ll b/test/SafepointIRVerifier/constant-bases.ll new file mode 100644 index 0000000000000..52a2a46d068d0 --- /dev/null +++ b/test/SafepointIRVerifier/constant-bases.ll @@ -0,0 +1,70 @@ +; RUN: opt -safepoint-ir-verifier-print-only -verify-safepoint-ir -S %s 2>&1 | FileCheck %s + +define i8 addrspace(1)* @test1(i64 %arg) gc "statepoint-example" { +; CHECK: No illegal uses found by SafepointIRVerifier in: test1 +entry: + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0) + ret i8 addrspace(1)* null +} + +define i8 addrspace(1)* @test2(i64 %arg) gc "statepoint-example" { +; CHECK: No illegal uses found by SafepointIRVerifier in: test2 +entry: + %load_addr = getelementptr i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*), i64 %arg + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0) + ret i8 addrspace(1)* %load_addr +} + +define i8 addrspace(1)* @test3(i64 %arg) gc "statepoint-example" { +; CHECK: No illegal uses found by SafepointIRVerifier in: test3 +entry: + %load_addr = getelementptr i32, i32 addrspace(1)* inttoptr (i64 15 to i32 addrspace(1)*), i64 %arg + %load_addr.cast = bitcast i32 addrspace(1)* %load_addr to i8 addrspace(1)* + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0) + ret i8 addrspace(1)* %load_addr.cast +} + +define i8 addrspace(1)* @test4(i64 %arg, i1 %cond) gc "statepoint-example" { +; CHECK: No illegal uses found by SafepointIRVerifier in: test4 +entry: + %load_addr.1 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*), i64 %arg + br i1 %cond, label %split, label %join + +split: + %load_addr.2 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 30 to i8 addrspace(1)*), i64 %arg + br label %join + +join: + %load_addr = phi i8 addrspace(1)* [%load_addr.1, %entry], [%load_addr.2, %split] + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0) + ret i8 addrspace(1)* %load_addr +} + +define i8 addrspace(1)* @test5(i64 %arg, i1 %cond) gc "statepoint-example" { +; CHECK: No illegal uses found by SafepointIRVerifier in: test5 +entry: + %load_addr.1 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 15 to i8 addrspace(1)*), i64 %arg + %load_addr.2 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 30 to i8 addrspace(1)*), i64 %arg + %load_addr = select i1 %cond, i8 addrspace(1)* %load_addr.1, i8 addrspace(1)* %load_addr.2 + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0) + ret i8 addrspace(1)* %load_addr +} + +define i8 addrspace(1)* @test6(i64 %arg, i1 %cond, i8 addrspace(1)* %base) gc "statepoint-example" { +; CHECK-LABEL: Verifying gc pointers in function: test6 +; CHECK: Illegal use of unrelocated value found! +entry: + %load_addr.1 = getelementptr i8, i8 addrspace(1)* %base, i64 %arg + br i1 %cond, label %split, label %join + +split: + %load_addr.2 = getelementptr i8, i8 addrspace(1)* inttoptr (i64 30 to i8 addrspace(1)*), i64 %arg + br label %join + +join: + %load_addr = phi i8 addrspace(1)* [%load_addr.1, %entry], [%load_addr.2, %split] + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0) + ret i8 addrspace(1)* %load_addr +} + +declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) diff --git a/test/SafepointIRVerifier/unrecorded-live-at-sp.ll b/test/SafepointIRVerifier/unrecorded-live-at-sp.ll new file mode 100644 index 0000000000000..e3f21c3e7133a --- /dev/null +++ b/test/SafepointIRVerifier/unrecorded-live-at-sp.ll @@ -0,0 +1,71 @@ +; RUN: opt %s -safepoint-ir-verifier-print-only -verify-safepoint-ir -S 2>&1 | FileCheck %s + +; CHECK: Illegal use of unrelocated value found! 
+; CHECK-NEXT: Def: %base_phi3 = phi %jObject addrspace(1)* [ %obj609.relocated, %not_zero146 ], [ %base_phi2, %bci_37-aload ], !is_base_value !0 +; CHECK-NEXT: Use: %base_phi2 = phi %jObject addrspace(1)* [ %base_phi3, %not_zero179 ], [ %cast5, %bci_0 ], !is_base_value !0 + +%jObject = type { [8 x i8] } + +declare %jObject addrspace(1)* @generate_obj1() #1 + +declare %jObject addrspace(1)* addrspace(1)* @generate_obj2() #1 + +declare %jObject addrspace(1)* @generate_obj3() #1 + +; Function Attrs: nounwind +define void @test(%jObject addrspace(1)*, %jObject addrspace(1)*, i32) #3 gc "statepoint-example" { +bci_0: + %result608 = call %jObject addrspace(1)* @generate_obj3() + %obj609 = bitcast %jObject addrspace(1)* %result608 to %jObject addrspace(1)* + %cast = bitcast %jObject addrspace(1)* %result608 to %jObject addrspace(1)* + %cast5 = bitcast %jObject addrspace(1)* %result608 to %jObject addrspace(1)* + br label %bci_37-aload + +bci_37-aload: ; preds = %not_zero179, %bci_0 + %base_phi = phi %jObject addrspace(1)* [ %base_phi1.relocated, %not_zero179 ], [ %cast, %bci_0 ], !is_base_value !0 + %base_phi2 = phi %jObject addrspace(1)* [ %base_phi3, %not_zero179 ], [ %cast5, %bci_0 ], !is_base_value !0 + %relocated8 = phi %jObject addrspace(1)* [ %relocated7.relocated, %not_zero179 ], [ %obj609, %bci_0 ] + %tmp3 = getelementptr inbounds %jObject, %jObject addrspace(1)* %relocated8, i64 0, i32 0, i64 32 + %addr98 = bitcast i8 addrspace(1)* %tmp3 to %jObject addrspace(1)* addrspace(1)* + %cast6 = bitcast %jObject addrspace(1)* %base_phi2 to %jObject addrspace(1)* addrspace(1)* + br i1 undef, label %not_zero179, label %not_zero146 + +not_zero146: ; preds = %bci_37-aload + %addr98.relocated = call %jObject addrspace(1)* addrspace(1)* @generate_obj2() #1 + %obj609.relocated = call %jObject addrspace(1)* @generate_obj1() #1 + br label %not_zero179 + +not_zero179: ; preds = %not_zero146, %bci_37-aload + %base_phi1 = phi %jObject addrspace(1)* [ %obj609.relocated, %not_zero146 ], [ %base_phi, %bci_37-aload ], !is_base_value !0 + %base_phi3 = phi %jObject addrspace(1)* [ %obj609.relocated, %not_zero146 ], [ %base_phi2, %bci_37-aload ], !is_base_value !0 + %relocated7 = phi %jObject addrspace(1)* [ %obj609.relocated, %not_zero146 ], [ %relocated8, %bci_37-aload ] + %base_phi4 = phi %jObject addrspace(1)* addrspace(1)* [ %addr98.relocated, %not_zero146 ], [ %cast6, %bci_37-aload ], !is_base_value !0 + %relocated4 = phi %jObject addrspace(1)* addrspace(1)* [ %addr98.relocated, %not_zero146 ], [ %addr98, %bci_37-aload ] + %safepoint_token = tail call token (i64, i32, i32 ()*, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0f_i32f(i64 0, i32 0, i32 ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, %jObject addrspace(1)* %base_phi1, %jObject addrspace(1)* addrspace(1)* %base_phi4, %jObject addrspace(1)* addrspace(1)* %relocated4, %jObject addrspace(1)* %relocated7) + %tmp4 = call i32 @llvm.experimental.gc.result.i32(token %safepoint_token) + %base_phi1.relocated = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token, i32 12, i32 12) + %base_phi4.relocated = call coldcc %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token %safepoint_token, i32 13, i32 13) + %relocated4.relocated = call coldcc %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token %safepoint_token, i32 13, i32 14) + %relocated7.relocated = call coldcc %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token %safepoint_token, i32 12, i32 15) + %addr636 = bitcast %jObject addrspace(1)* addrspace(1)* %relocated4.relocated to %jObject addrspace(1)* addrspace(1)* + br label %bci_37-aload +} + +declare token @llvm.experimental.gc.statepoint.p0f_i32f(i64, i32, i32 ()*, i32, i32, ...) + +; Function Attrs: nounwind +declare i32 @llvm.experimental.gc.result.i32(token) #4 + +; Function Attrs: nounwind +declare %jObject addrspace(1)* @llvm.experimental.gc.relocate.p1jObject(token, i32, i32) #4 + +; Function Attrs: nounwind +declare %jObject addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1jObject(token, i32, i32) #4 + +attributes #0 = { noinline nounwind "gc-leaf-function"="true" } +attributes #1 = { "gc-leaf-function"="true" } +attributes #2 = { nounwind readonly "gc-leaf-function"="true" } +attributes #3 = { nounwind } +attributes #4 = { nounwind } + +!0 = !{i32 1} diff --git a/test/SafepointIRVerifier/uses-in-phi-nodes.ll b/test/SafepointIRVerifier/uses-in-phi-nodes.ll new file mode 100644 index 0000000000000..d06eb6e0d9a7c --- /dev/null +++ b/test/SafepointIRVerifier/uses-in-phi-nodes.ll @@ -0,0 +1,78 @@ +; RUN: opt -safepoint-ir-verifier-print-only -verify-safepoint-ir -S %s 2>&1 | FileCheck %s + +define i8 addrspace(1)* @test.not.ok.0(i8 addrspace(1)* %arg) gc "statepoint-example" { +; CHECK-LABEL: Verifying gc pointers in function: test.not.ok.0 + bci_0: + br i1 undef, label %left, label %right + + left: + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0) + br label %merge + + right: + br label %merge + + merge: +; CHECK: Illegal use of unrelocated value found! +; CHECK-NEXT: Def: i8 addrspace(1)* %arg +; CHECK-NEXT: Use: %val = phi i8 addrspace(1)* [ %arg, %left ], [ %arg, %right ] + %val = phi i8 addrspace(1)* [ %arg, %left ], [ %arg, %right] + ret i8 addrspace(1)* %val +} + +define i8 addrspace(1)* @test.not.ok.1(i8 addrspace(1)* %arg) gc "statepoint-example" { +; CHECK-LABEL: Verifying gc pointers in function: test.not.ok.1 + bci_0: + br i1 undef, label %left, label %right + + left: + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0) + br label %merge + + right: + br label %merge + + merge: +; CHECK: Illegal use of unrelocated value found! 
+; CHECK-NEXT: Def: i8 addrspace(1)* %arg +; CHECK-NEXT: Use: %val = phi i8 addrspace(1)* [ %arg, %left ], [ null, %right ] + %val = phi i8 addrspace(1)* [ %arg, %left ], [ null, %right] + ret i8 addrspace(1)* %val +} + +define i8 addrspace(1)* @test.ok.0(i8 addrspace(1)* %arg) gc "statepoint-example" { +; CHECK: No illegal uses found by SafepointIRVerifier in: test.ok.0 + bci_0: + br i1 undef, label %left, label %right + + left: + %safepoint_token = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* undef, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0) + br label %merge + + right: + br label %merge + + merge: + %val = phi i8 addrspace(1)* [ null, %left ], [ null, %right] + ret i8 addrspace(1)* %val +} + +define i8 addrspace(1)* @test.ok.1(i8 addrspace(1)* %arg) gc "statepoint-example" { +; CHECK: No illegal uses found by SafepointIRVerifier in: test.ok.1 + bci_0: + br i1 undef, label %left, label %right + + left: + call void @not_statepoint() + br label %merge + + right: + br label %merge + + merge: + %val = phi i8 addrspace(1)* [ %arg, %left ], [ %arg, %right] + ret i8 addrspace(1)* %val +} + +declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...) +declare void @not_statepoint() diff --git a/test/TableGen/AsmVariant.td b/test/TableGen/AsmVariant.td index cb5d32385d3b3..70d410ee7bd06 100644 --- a/test/TableGen/AsmVariant.td +++ b/test/TableGen/AsmVariant.td @@ -31,6 +31,7 @@ def foo : Instruction { let InOperandList = (ins); let AsmString = "foo"; let AsmVariantName = "Foo"; + let Namespace = "Arch"; } def BarAlias : InstAlias<"bar", (foo)> { diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td index 7c09b97a5e998..114d0e23b855b 100644 --- a/test/TableGen/GlobalISelEmitter.td +++ b/test/TableGen/GlobalISelEmitter.td @@ -7,6 +7,10 @@ include "llvm/Target/Target.td" def MyTargetISA : InstrInfo; def MyTarget : Target { let InstructionSet = MyTargetISA; } +let TargetPrefix = "mytarget" in { +def int_mytarget_nop : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; +} + def R0 : Register<"r0"> { let Namespace = "MyTarget"; } def GPR32 : RegisterClass<"MyTarget", [i32], 32, (add R0)>; def GPR32Op : RegisterOperand<GPR32>; @@ -38,6 +42,23 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; } //===- Test the function boilerplate. 
-------------------------------------===// +// CHECK: const unsigned MAX_SUBTARGET_PREDICATES = 3; +// CHECK: using PredicateBitset = llvm::PredicateBitsetImpl<MAX_SUBTARGET_PREDICATES>; + +// CHECK-LABEL: #ifdef GET_GLOBALISEL_TEMPORARIES_DECL +// CHECK-NEXT: mutable MatcherState State; +// CHECK-NEXT: typedef ComplexRendererFn(MyTargetInstructionSelector::*ComplexMatcherMemFn)(MachineOperand &) const; +// CHECK-NEXT: const MatcherInfoTy<PredicateBitset, ComplexMatcherMemFn> MatcherInfo; +// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL + +// CHECK-LABEL: #ifdef GET_GLOBALISEL_TEMPORARIES_INIT +// CHECK-NEXT: , State(2), +// CHECK-NEXT: MatcherInfo({TypeObjects, FeatureBitsets, { +// CHECK-NEXT: nullptr, // GICP_Invalid +// CHECK-NEXT: &MyTargetInstructionSelector::selectComplexPattern, // gi_complex +// CHECK-NEXT: }}) +// CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_INIT + // CHECK-LABEL: enum SubtargetFeatureBits : uint8_t { // CHECK-NEXT: Feature_HasABit = 0, // CHECK-NEXT: Feature_HasBBit = 1, @@ -63,39 +84,104 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; } // CHECK-NEXT: } // CHECK: bool MyTargetInstructionSelector::selectImpl(MachineInstr &I) const { -// CHECK: MachineFunction &MF = *I.getParent()->getParent(); -// CHECK: const MachineRegisterInfo &MRI = MF.getRegInfo(); +// CHECK-NEXT: MachineFunction &MF = *I.getParent()->getParent(); +// CHECK-NEXT: MachineRegisterInfo &MRI = MF.getRegInfo(); +// CHECK: AvailableFunctionFeatures = computeAvailableFunctionFeatures(&STI, &MF); +// CHECK-NEXT: const PredicateBitset AvailableFeatures = getAvailableFeatures(); +// CHECK-NEXT: NewMIVector OutMIs; +// CHECK-NEXT: State.MIs.clear(); +// CHECK-NEXT: State.MIs.push_back(&I); + +//===- Test a pattern with multiple ComplexPatterns in multiple instrs ----===// +// + +// CHECK-LABEL: MatchTable0[] = { +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/4, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SELECT, +// CHECK-NEXT: // MIs[0] dst +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] src1 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] src2 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/0, /*Op*/2, /*Renderer*/0, GICP_gi_complex, +// CHECK-NEXT: // MIs[0] src3 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/3, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/0, /*Op*/3, /*Renderer*/1, GICP_gi_complex, +// CHECK-NEXT: // (select:i32 GPR32:i32:$src1, complex:i32:$src2, complex:i32:$src3) => (INSN2:i32 GPR32:i32:$src1, complex:i32:$src3, complex:i32:$src2) +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::INSN2, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1 +// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/1, +// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/0, +// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: }; +// CHECK-NEXT: MIs.resize(1); +// CHECK-NEXT: 
DEBUG(dbgs() << "Processing MatchTable0\n"); +// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable0, TII, MRI, TRI, RBI, AvailableFeatures)) { +// CHECK-NEXT: return true; +// CHECK-NEXT: } + +def INSN3 : I<(outs GPR32:$dst), + (ins GPR32Op:$src1, complex:$src2, GPR32:$src3, complex:$src4, complex:$src5), []>; +def : Pat<(select GPR32:$src1, complex:$src2, (select GPR32:$src3, complex:$src4, complex:$src5)), + (INSN3 GPR32:$src1, complex:$src2, GPR32:$src3, complex:$src4, complex:$src5)>; //===- Test a pattern with multiple ComplexPattern operands. --------------===// // -// CHECK-LABEL: if ([&]() { -// CHECK-NEXT: MachineInstr &MI0 = I; -// CHECK-NEXT: if (MI0.getNumOperands() < 4) -// CHECK-NEXT: return false; -// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_SELECT) && -// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((Renderer0 = selectComplexPattern(MI0.getOperand(2)))))) && -// CHECK-NEXT: ((/* src3 */ (MRI.getType(MI0.getOperand(3).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((Renderer1 = selectComplexPattern(MI0.getOperand(3))))))) { -// CHECK-NEXT: // (select:i32 GPR32:i32:$src1, complex:i32:$src2, complex:i32:$src3) => (INSN2:i32 GPR32:i32:$src1, complex:i32:$src3, complex:i32:$src2) -// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::INSN2)); -// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/); -// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/); -// CHECK-NEXT: Renderer1(MIB); -// CHECK-NEXT: Renderer0(MIB); -// CHECK-NEXT: for (const auto *FromMI : {&MI0, }) -// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands()) -// CHECK-NEXT: MIB.addMemOperand(MMO); -// CHECK-NEXT: I.eraseFromParent(); -// CHECK-NEXT: MachineInstr &NewI = *MIB; -// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI); -// CHECK-NEXT: return true; -// CHECK-NEXT: } +// CHECK-LABEL: MatchTable1[] = { +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/4, +// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/3, // MIs[1] +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/4, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SELECT, +// CHECK-NEXT: // MIs[0] dst +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] src1 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] src2 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/0, /*Op*/2, /*Renderer*/0, GICP_gi_complex, +// CHECK-NEXT: // MIs[0] Operand 3 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/3, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_SELECT, +// CHECK-NEXT: // MIs[1] Operand 0 +// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, 
/*Type*/GILLT_s32, +// CHECK-NEXT: // MIs[1] src3 +// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[1] src4 +// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/1, /*Op*/2, /*Renderer*/1, GICP_gi_complex, +// CHECK-NEXT: // MIs[1] src5 +// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/3, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/1, /*Op*/3, /*Renderer*/2, GICP_gi_complex, +// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1, +// CHECK-NEXT: // (select:i32 GPR32:i32:$src1, complex:i32:$src2, (select:i32 GPR32:i32:$src3, complex:i32:$src4, complex:i32:$src5)) => (INSN3:i32 GPR32:i32:$src1, complex:i32:$src2, GPR32:i32:$src3, complex:i32:$src4, complex:i32:$src5) +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::INSN3, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1 +// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/0, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src3 +// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/1, +// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/2, +// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: }; +// CHECK-NEXT: MIs.resize(1); +// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable1\n"); +// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable1, TII, MRI, TRI, RBI, AvailableFeatures)) { +// CHECK-NEXT: return true; +// CHECK-NEXT: } def : GINodeEquiv<G_SELECT, select>; def INSN2 : I<(outs GPR32:$dst), (ins GPR32Op:$src1, complex:$src2, complex:$src3), []>; @@ -104,119 +190,149 @@ def : Pat<(select GPR32:$src1, complex:$src2, complex:$src3), //===- Test a simple pattern with regclass operands. 
----------------------===//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_ADD) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
-
-// CHECK-NEXT: // (add:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (ADD:i32 GPR32:i32:$src1, GPR32:i32:$src2)
-// CHECK-NEXT: I.setDesc(TII.get(MyTarget::ADD));
-// CHECK-NEXT: MachineInstr &NewI = I;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
-
+// CHECK-LABEL: MatchTable2[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_ADD,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // (add:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (ADD:i32 GPR32:i32:$src1, GPR32:i32:$src2)
+// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/ 0, /*Opcode*/MyTarget::ADD,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable2\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable2, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2),
            [(set GPR32:$dst, (add GPR32:$src1, GPR32:$src2))]>;
+//===- Test a simple pattern with an intrinsic. ---------------------------===//
+//
+
+// CHECK-LABEL: MatchTable3[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_INTRINSIC,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 1
+// CHECK-NEXT: GIM_CheckIntrinsicID, /*MI*/0, /*Op*/1, Intrinsic::mytarget_nop,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // (intrinsic_wo_chain:i32 [[ID:[0-9]+]]:iPTR, GPR32:i32:$src1) => (MOV:i32 GPR32:i32:$src1)
+
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MOV,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/2, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable3\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable3, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
+
+def MOV : I<(outs GPR32:$dst), (ins GPR32:$src1),
+            [(set GPR32:$dst, (int_mytarget_nop GPR32:$src1))]>;
+
//===- Test a nested instruction match. -----------------------------------===//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: PredicateBitset ExpectedFeatures = {Feature_HasABit};
-// CHECK-NEXT: if ((AvailableFeatures & ExpectedFeatures) != ExpectedFeatures)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if (!MI0.getOperand(1).isReg())
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if (TRI.isPhysicalRegister(MI0.getOperand(1).getReg()))
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI1 = *MRI.getVRegDef(MI0.getOperand(1).getReg());
-// CHECK-NEXT: if (MI1.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_MUL) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (((MI1.getOpcode() == TargetOpcode::G_ADD) &&
-// CHECK-NEXT: ((/* Operand 0 */ (MRI.getType(MI1.getOperand(0).getReg()) == (LLT::scalar(32))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI1.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI1.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(2).getReg(), MRI, TRI))))))
-// CHECK-NEXT: ))) &&
-// CHECK-NEXT: ((/* src3 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
-// CHECK-NEXT: if (!isObviouslySafeToFold(MI1)) return false;
-// CHECK-NEXT: // (mul:i32 (add:i32 GPR32:i32:$src1, GPR32:i32:$src2), GPR32:i32:$src3) => (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MULADD));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI1.getOperand(1)/*src1*/);
-// CHECK-NEXT: MIB.add(MI1.getOperand(2)/*src2*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(2)/*src3*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, &MI1, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
+// CHECK-LABEL: MatchTable4[] = {
+// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasA,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_MUL,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ADD,
+// CHECK-NEXT: // MIs[1] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[1] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[1] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src3
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1,
+// CHECK-NEXT: // (mul:i32 (add:i32 GPR32:i32:$src1, GPR32:i32:$src2), GPR32:i32:$src3) => (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MULADD,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/2, // src2
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/2, // src3
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable4\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable4, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
// We also get a second rule by commutativity.
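The hunk above shows the core of the new scheme: each rule is now a flat array of integers walked by a generic interpreter, rather than a dedicated C++ lambda per rule. A minimal sketch of that table-driven style follows; the opcode set, table layout, and Insn type are invented for illustration and are not LLVM's actual API.

#include <cstdint>

// Invented, simplified stand-ins for the real machinery: each check opcode
// reads its fixed arguments from the table and either rejects the rule or
// falls through to the next entry; Done reports a complete match.
enum Opcode : int64_t { CheckNumOperands, CheckOpcode, Done };

struct Insn {
  int64_t Opc;
  int64_t NumOps;
};

// Walks the table until a check fails (return false) or Done is hit.
static bool executeTable(const int64_t *P, const Insn &MI) {
  for (;;) {
    switch (*P++) {
    case CheckNumOperands: {
      ++P; // skip the /*MI*/ index; this sketch only models MIs[0]
      int64_t Expected = *P++;
      if (MI.NumOps != Expected)
        return false;
      break;
    }
    case CheckOpcode: {
      ++P; // skip the /*MI*/ index
      int64_t Opc = *P++;
      if (MI.Opc != Opc)
        return false;
      break;
    }
    case Done:
      return true; // a real table would run its GIR_* renderers before this
    default:
      return false; // unknown opcode: treat as no-match
    }
  }
}

int main() {
  const int64_t G_ADD = 42; // placeholder opcode number
  const int64_t Table[] = {CheckNumOperands, /*MI*/ 0, /*Expected*/ 3,
                           CheckOpcode, /*MI*/ 0, G_ADD, Done};
  Insn Add{G_ADD, 3};
  return executeTable(Table, Add) ? 0 : 1;
}

Moving each rule from code into data is what the MIs.resize(1) / executeMatchTable(...) boilerplate in the new CHECK lines is exercising: adding a rule grows only a table, while the interpreter is shared by every rule. The commuted variant of the same rule follows.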
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: PredicateBitset ExpectedFeatures = {Feature_HasABit};
-// CHECK-NEXT: if ((AvailableFeatures & ExpectedFeatures) != ExpectedFeatures)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if (!MI0.getOperand(2).isReg())
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if (TRI.isPhysicalRegister(MI0.getOperand(2).getReg()))
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI1 = *MRI.getVRegDef(MI0.getOperand(2).getReg());
-// CHECK-NEXT: if (MI1.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_MUL) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src3 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (((MI1.getOpcode() == TargetOpcode::G_ADD) &&
-// CHECK-NEXT: ((/* Operand 0 */ (MRI.getType(MI1.getOperand(0).getReg()) == (LLT::scalar(32))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI1.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI1.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(2).getReg(), MRI, TRI))))))
-// CHECK-NEXT: )))) {
-// CHECK-NEXT: if (!isObviouslySafeToFold(MI1)) return false;
-// CHECK-NEXT: // (mul:i32 GPR32:i32:$src3, (add:i32 GPR32:i32:$src1, GPR32:i32:$src2)) => (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MULADD));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI1.getOperand(1)/*src1*/);
-// CHECK-NEXT: MIB.add(MI1.getOperand(2)/*src2*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src3*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, &MI1, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
+// CHECK-LABEL: MatchTable5[] = {
+// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasA,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/2,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_MUL,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src3
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_ADD,
+// CHECK-NEXT: // MIs[1] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[1] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[1] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1,
+// CHECK-NEXT: // (mul:i32 GPR32:i32:$src3, (add:i32 GPR32:i32:$src1, GPR32:i32:$src2)) => (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MULADD,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/2, // src2
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src3
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable5\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable5, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def MULADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3),
               [(set GPR32:$dst,
@@ -225,67 +341,129 @@ def MULADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3),
//===- Test another simple pattern with regclass operands. ----------------===//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: PredicateBitset ExpectedFeatures = {Feature_HasABit, Feature_HasBBit, Feature_HasCBit};
-// CHECK-NEXT: if ((AvailableFeatures & ExpectedFeatures) != ExpectedFeatures)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_MUL) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
-// CHECK-NEXT: // (mul:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (MUL:i32 GPR32:i32:$src2, GPR32:i32:$src1)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MUL));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(2)/*src2*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable6[] = {
+// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasA_HasB_HasC,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_MUL,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // (mul:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (MUL:i32 GPR32:i32:$src2, GPR32:i32:$src1)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MUL,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/2, // src2
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable6\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable6, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def MUL : I<(outs GPR32:$dst), (ins GPR32:$src2, GPR32:$src1),
            [(set GPR32:$dst, (mul GPR32:$src1, GPR32:$src2))]>,
            Requires<[HasA, HasB, HasC]>;
+//===- Test a more complex multi-instruction match. -----------------------===//
+
+// CHECK-LABEL: MatchTable7[] = {
+// CHECK-NEXT: GIM_CheckFeatures, GIFBS_HasA,
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/1, /*MI*/0, /*OpIdx*/1, // MIs[1]
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/1, /*Expected*/3,
+// CHECK-NEXT: GIM_RecordInsn, /*DefineMI*/2, /*MI*/0, /*OpIdx*/2, // MIs[2]
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/2, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SUB,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/1, TargetOpcode::G_SUB,
+// CHECK-NEXT: // MIs[1] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[1] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[1] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/1, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/1, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/2, TargetOpcode::G_SUB,
+// CHECK-NEXT: // MIs[2] Operand 0
+// CHECK-NEXT: GIM_CheckType, /*MI*/2, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: // MIs[2] src3
+// CHECK-NEXT: GIM_CheckType, /*MI*/2, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/2, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[2] src4
+// CHECK-NEXT: GIM_CheckType, /*MI*/2, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/2, /*Op*/2, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/1,
+// CHECK-NEXT: GIM_CheckIsSafeToFold, /*InsnID*/2,
+// CHECK-NEXT: // (sub:i32 (sub:i32 GPR32:i32:$src1, GPR32:i32:$src2), (sub:i32 GPR32:i32:$src3, GPR32:i32:$src4)) => (INSNBOB:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3, GPR32:i32:$src4)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::INSNBOB,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/2, // src2
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/2, /*OpIdx*/1, // src3
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/2, /*OpIdx*/2, // src4
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable7\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable7, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
+
+def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4),
+                [(set GPR32:$dst,
+                     (sub (sub GPR32:$src1, GPR32:$src2), (sub GPR32:$src3, GPR32:$src4)))]>,
+                Requires<[HasA]>;
+
//===- Test a pattern with ComplexPattern operands. -----------------------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_SUB) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((Renderer0 = selectComplexPattern(MI0.getOperand(2))))))) {
-// CHECK-NEXT: // (sub:i32 GPR32:i32:$src1, complex:i32:$src2) => (INSN1:i32 GPR32:i32:$src1, complex:i32:$src2)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::INSN1));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: Renderer0(MIB);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
+// CHECK-LABEL: MatchTable8[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_SUB,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckComplexPattern, /*MI*/0, /*Op*/2, /*Renderer*/0, GICP_gi_complex,
+// CHECK-NEXT: // (sub:i32 GPR32:i32:$src1, complex:i32:$src2) => (INSN1:i32 GPR32:i32:$src1, complex:i32:$src2)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::INSN1,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_ComplexRenderer, /*InsnID*/0, /*RendererID*/0,
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable8\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable8, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
def INSN1 : I<(outs GPR32:$dst), (ins GPR32:$src1, complex:$src2), []>;
def : Pat<(sub GPR32:$src1, complex:$src2), (INSN1 GPR32:$src1, complex:$src2)>;
@@ -293,32 +471,33 @@ def : Pat<(sub GPR32:$src1, complex:$src2), (INSN1 GPR32:$src1, complex:$src2)>;
//===- Test a simple pattern with a default operand. ----------------------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -2, MRI))))) {
-// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -2:i32) => (XORI:i32 GPR32:i32:$src1)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XORI));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.addImm(-1);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable9[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -2
+// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -2:i32) => (XORI:i32 GPR32:i32:$src1)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::XORI,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_AddImm, /*InsnID*/0, /*Imm*/-1,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable9\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable9, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
// The -2 is just to distinguish it from the 'not' case below.
def XORI : I<(outs GPR32:$dst), (ins m1:$src2, GPR32:$src1),
@@ -327,32 +506,33 @@ def XORI : I<(outs GPR32:$dst), (ins m1:$src2, GPR32:$src1),
//===- Test a simple pattern with a default register operand. -------------===//
//
-// CHECK-LABEL: if ([&]() {
-// CHECK-NEXT: MachineInstr &MI0 = I;
-// CHECK-NEXT: if (MI0.getNumOperands() < 3)
-// CHECK-NEXT: return false;
-// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
-// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
-// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
-// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -3, MRI))))) {
-// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -3:i32) => (XOR:i32 GPR32:i32:$src1)
-// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XOR));
-// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/);
-// CHECK-NEXT: MIB.addReg(MyTarget::R0);
-// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/);
-// CHECK-NEXT: for (const auto *FromMI : {&MI0, })
-// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands())
-// CHECK-NEXT: MIB.addMemOperand(MMO);
-// CHECK-NEXT: I.eraseFromParent();
-// CHECK-NEXT: MachineInstr &NewI = *MIB;
-// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
-// CHECK-NEXT: return true;
-// CHECK-NEXT: }
-// CHECK-NEXT: return false;
-// CHECK-NEXT: }()) { return true; }
+// CHECK-LABEL: MatchTable10[] = {
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR,
+// CHECK-NEXT: // MIs[0] dst
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] src1
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID,
+// CHECK-NEXT: // MIs[0] Operand 2
+// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32,
+// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -3
+// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -3:i32) => (XOR:i32 GPR32:i32:$src1)
+// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::XOR,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst
+// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0,
+// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1
+// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0,
+// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0,
+// CHECK-NEXT: GIR_Done,
+// CHECK-NEXT: };
+// CHECK-NEXT: MIs.resize(1);
+// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable10\n");
+// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable10, TII, MRI, TRI, RBI, AvailableFeatures)) {
+// CHECK-NEXT: return true;
+// CHECK-NEXT: }
// The -3 is just to distinguish it from the 'not' case below and the other default op case above.
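The two default-operand rules just above make a point worth pulling out: XORI's -1 immediate and XOR's R0 register never exist on the matched G_XOR, so the table renders them with explicit GIR_AddImm / GIR_AddRegister steps rather than GIR_Copy. A toy renderer loop showing that split is below; the opcode names and types are invented here, not LLVM's.

#include <cstdint>
#include <vector>

// Invented renderer opcodes: CopyOp pulls an operand off the matched
// instruction by index; AddImm / AddReg append operands that have no
// counterpart on the matched instruction (the "default" operands).
enum RenderOp : int64_t { CopyOp, AddImm, AddReg, EndRender };

struct Operand {
  bool IsReg;
  int64_t Val; // register id or immediate
};

static std::vector<Operand> render(const int64_t *P,
                                   const std::vector<Operand> &Matched) {
  std::vector<Operand> Out;
  for (;;) {
    switch (*P++) {
    case CopyOp:
      Out.push_back(Matched[*P++]);
      break;
    case AddImm:
      Out.push_back({/*IsReg=*/false, *P++});
      break;
    case AddReg:
      Out.push_back({/*IsReg=*/true, *P++});
      break;
    default: // EndRender
      return Out;
    }
  }
}

int main() {
  // Mirrors (xor $src1, -2) => (XORI -1, $src1): copy dst, append the
  // default -1 immediate, then copy src1.
  const int64_t Prog[] = {CopyOp, 0, AddImm, -1, CopyOp, 1, EndRender};
  std::vector<Operand> MatchedXor = {{true, /*dst vreg*/ 100},
                                     {true, /*src1 vreg*/ 101}};
  std::vector<Operand> Out = render(Prog, MatchedXor);
  bool OK = Out.size() == 3 && !Out[1].IsReg && Out[1].Val == -1;
  return OK ? 0 : 1;
}

The multiple-default sections that follow (XORlike, XORManyDefaults) are the same mechanism repeated: one extra GIR_Add* table entry per defaulted operand.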
def XOR : I<(outs GPR32:$dst), (ins Z:$src2, GPR32:$src1), @@ -361,33 +541,34 @@ def XOR : I<(outs GPR32:$dst), (ins Z:$src2, GPR32:$src1), //===- Test a simple pattern with a multiple default operands. ------------===// // -// CHECK-LABEL: if ([&]() { -// CHECK-NEXT: MachineInstr &MI0 = I; -// CHECK-NEXT: if (MI0.getNumOperands() < 3) -// CHECK-NEXT: return false; -// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) && -// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -4, MRI))))) { -// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -4:i32) => (XORlike:i32 GPR32:i32:$src1) -// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XORlike)); -// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/); -// CHECK-NEXT: MIB.addImm(-1); -// CHECK-NEXT: MIB.addReg(MyTarget::R0); -// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/); -// CHECK-NEXT: for (const auto *FromMI : {&MI0, }) -// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands()) -// CHECK-NEXT: MIB.addMemOperand(MMO); -// CHECK-NEXT: I.eraseFromParent(); -// CHECK-NEXT: MachineInstr &NewI = *MIB; -// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI); -// CHECK-NEXT: return true; -// CHECK-NEXT: } -// CHECK-NEXT: return false; -// CHECK-NEXT: }()) { return true; } +// CHECK-LABEL: MatchTable11[] = { +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR, +// CHECK-NEXT: // MIs[0] dst +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] src1 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] Operand 2 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -4 +// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -4:i32) => (XORlike:i32 GPR32:i32:$src1) +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::XORlike, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst +// CHECK-NEXT: GIR_AddImm, /*InsnID*/0, /*Imm*/-1, +// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1 +// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: }; +// CHECK-NEXT: MIs.resize(1); +// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable11\n"); +// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable11, TII, MRI, TRI, RBI, AvailableFeatures)) { +// CHECK-NEXT: return true; +// CHECK-NEXT: } // The -4 is just to distinguish it from the other 'not' cases. 
def XORlike : I<(outs GPR32:$dst), (ins m1Z:$src2, GPR32:$src1), @@ -396,34 +577,35 @@ def XORlike : I<(outs GPR32:$dst), (ins m1Z:$src2, GPR32:$src1), //===- Test a simple pattern with multiple operands with defaults. --------===// // -// CHECK-LABEL: if ([&]() { -// CHECK-NEXT: MachineInstr &MI0 = I; -// CHECK-NEXT: if (MI0.getNumOperands() < 3) -// CHECK-NEXT: return false; -// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) && -// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -5, MRI))))) { -// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -5:i32) => (XORManyDefaults:i32 GPR32:i32:$src1) -// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XORManyDefaults)); -// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/); -// CHECK-NEXT: MIB.addImm(-1); -// CHECK-NEXT: MIB.addReg(MyTarget::R0); -// CHECK-NEXT: MIB.addReg(MyTarget::R0); -// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*src1*/); -// CHECK-NEXT: for (const auto *FromMI : {&MI0, }) -// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands()) -// CHECK-NEXT: MIB.addMemOperand(MMO); -// CHECK-NEXT: I.eraseFromParent(); -// CHECK-NEXT: MachineInstr &NewI = *MIB; -// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI); -// CHECK-NEXT: return true; -// CHECK-NEXT: } -// CHECK-NEXT: return false; -// CHECK-NEXT: }()) { return true; } +// CHECK-LABEL: MatchTable12[] = { +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR, +// CHECK-NEXT: // MIs[0] dst +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] src1 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] Operand 2 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -5, +// CHECK-NEXT: // (xor:i32 GPR32:i32:$src1, -5:i32) => (XORManyDefaults:i32 GPR32:i32:$src1) +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::XORManyDefaults, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst +// CHECK-NEXT: GIR_AddImm, /*InsnID*/0, /*Imm*/-1, +// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0, +// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // src1 +// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: }; +// CHECK-NEXT: MIs.resize(1); +// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable12\n"); +// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable12, TII, MRI, TRI, RBI, 
AvailableFeatures)) { +// CHECK-NEXT: return true; +// CHECK-NEXT: } // The -5 is just to distinguish it from the other cases. def XORManyDefaults : I<(outs GPR32:$dst), (ins m1Z:$src3, Z:$src2, GPR32:$src1), @@ -434,32 +616,33 @@ def XORManyDefaults : I<(outs GPR32:$dst), (ins m1Z:$src3, Z:$src2, GPR32:$src1) // This must precede the 3-register variants because constant immediates have // priority over register banks. -// CHECK-LABEL: if ([&]() { -// CHECK-NEXT: MachineInstr &MI0 = I; -// CHECK-NEXT: if (MI0.getNumOperands() < 3) -// CHECK-NEXT: return false; -// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_XOR) && -// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* Wm */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: (isOperandImmEqual(MI0.getOperand(2), -1, MRI))))) { -// CHECK-NEXT: // (xor:i32 GPR32:i32:$Wm, -1:i32) => (ORN:i32 R0:i32, GPR32:i32:$Wm) -// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::ORN)); -// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/); -// CHECK-NEXT: MIB.addReg(MyTarget::R0); -// CHECK-NEXT: MIB.add(MI0.getOperand(1)/*Wm*/); -// CHECK-NEXT: for (const auto *FromMI : {&MI0, }) -// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands()) -// CHECK-NEXT: MIB.addMemOperand(MMO); -// CHECK-NEXT: I.eraseFromParent(); -// CHECK-NEXT: MachineInstr &NewI = *MIB; -// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI); -// CHECK-NEXT: return true; -// CHECK-NEXT: } -// CHECK-NEXT: return false; -// CHECK-NEXT: }()) { return true; } +// CHECK-LABEL: MatchTable13[] = { +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_XOR, +// CHECK-NEXT: // MIs[0] dst +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] Wm +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] Operand 2 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckConstantInt, /*MI*/0, /*Op*/2, -1, +// CHECK-NEXT: // (xor:i32 GPR32:i32:$Wm, -1:i32) => (ORN:i32 R0:i32, GPR32:i32:$Wm) +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::ORN, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst +// CHECK-NEXT: GIR_AddRegister, /*InsnID*/0, MyTarget::R0, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/1, // Wm +// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: }; +// CHECK-NEXT: MIs.resize(1); +// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable13\n"); +// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable13, TII, MRI, TRI, RBI, AvailableFeatures)) { +// CHECK-NEXT: return true; +// 
CHECK-NEXT: } def ORN : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2), []>; def : Pat<(not GPR32:$Wm), (ORN R0, GPR32:$Wm)>; @@ -467,70 +650,72 @@ def : Pat<(not GPR32:$Wm), (ORN R0, GPR32:$Wm)>; //===- Test a COPY_TO_REGCLASS --------------------------------------------===// // -// CHECK-LABEL: if ([&]() { -// CHECK-NEXT: MachineInstr &MI0 = I; -// CHECK-NEXT: if (MI0.getNumOperands() < 2) -// CHECK-NEXT: return false; -// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_BITCAST) && -// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::FPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI)))))) -// CHECK-NEXT: // (bitconvert:i32 FPR32:f32:$src1) => (COPY_TO_REGCLASS:i32 FPR32:f32:$src1, GPR32:i32) -// CHECK-NEXT: I.setDesc(TII.get(TargetOpcode::COPY)); -// CHECK-NEXT: MachineInstr &NewI = I; -// CHECK-NEXT: constrainOperandRegToRegClass(NewI, 0, MyTarget::GPR32RegClass, TII, TRI, RBI); -// CHECK-NEXT: return true; -// CHECK-NEXT: } -// CHECK-NEXT: return false; -// CHECK-NEXT: }()) { return true; } +// CHECK-LABEL: MatchTable14[] = { +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_BITCAST, +// CHECK-NEXT: // MIs[0] dst +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] src1 +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/1, /*RC*/MyTarget::FPR32RegClassID, +// CHECK-NEXT: // (bitconvert:i32 FPR32:f32:$src1) => (COPY_TO_REGCLASS:i32 FPR32:f32:$src1, GPR32:i32) +// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/ 0, /*Opcode*/TargetOpcode::COPY, +// CHECK-NEXT: GIR_ConstrainOperandRC, /*InsnID*/0, /*Op*/0, /*RC GPR32*/ 1, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: }; +// CHECK-NEXT: MIs.resize(1); +// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable14\n"); +// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable14, TII, MRI, TRI, RBI, AvailableFeatures)) { +// CHECK-NEXT: return true; +// CHECK-NEXT: } def : Pat<(i32 (bitconvert FPR32:$src1)), (COPY_TO_REGCLASS FPR32:$src1, GPR32)>; //===- Test a simple pattern with just a leaf immediate. 
------------------===// -// CHECK-LABEL: if ([&]() { -// CHECK-NEXT: MachineInstr &MI0 = I; -// CHECK-NEXT: if (MI0.getNumOperands() < 2) -// CHECK-NEXT: return false; -// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_CONSTANT) && -// CHECK-NEXT: ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) && -// CHECK-NEXT: ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) && -// CHECK-NEXT: ((/* Operand 1 */ (MI0.getOperand(1).isCImm() && MI0.getOperand(1).getCImm()->equalsInt(1))))) { -// CHECK-NEXT: // 1:i32 => (MOV1:i32) -// CHECK-NEXT: MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MOV1)); -// CHECK-NEXT: MIB.add(MI0.getOperand(0)/*dst*/); -// CHECK-NEXT: for (const auto *FromMI : {&MI0, }) -// CHECK-NEXT: for (const auto &MMO : FromMI->memoperands()) -// CHECK-NEXT: MIB.addMemOperand(MMO); -// CHECK-NEXT: I.eraseFromParent(); -// CHECK-NEXT: MachineInstr &NewI = *MIB; -// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI); -// CHECK-NEXT: return true; -// CHECK-NEXT: } -// CHECK-NEXT: return false; -// CHECK-NEXT: }()) { return true; } +// CHECK-LABEL: MatchTable15[] = { +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_CONSTANT, +// CHECK-NEXT: // MIs[0] dst +// CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/0, /*Type*/GILLT_s32, +// CHECK-NEXT: GIM_CheckRegBankForClass, /*MI*/0, /*Op*/0, /*RC*/MyTarget::GPR32RegClassID, +// CHECK-NEXT: // MIs[0] Operand 1 +// CHECK-NEXT: GIM_CheckLiteralInt, /*MI*/0, /*Op*/1, 1, +// CHECK-NEXT: // 1:i32 => (MOV1:i32) +// CHECK-NEXT: GIR_BuildMI, /*InsnID*/0, /*Opcode*/MyTarget::MOV1, +// CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/0, /*OpIdx*/0, // dst +// CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: }; +// CHECK-NEXT: MIs.resize(1); +// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable15\n"); +// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable15, TII, MRI, TRI, RBI, AvailableFeatures)) { +// CHECK-NEXT: return true; +// CHECK-NEXT: } def MOV1 : I<(outs GPR32:$dst), (ins), [(set GPR32:$dst, 1)]>; //===- Test a pattern with an MBB operand. 
--------------------------------===// -// CHECK-LABEL: if ([&]() { -// CHECK-NEXT: MachineInstr &MI0 = I; -// CHECK-NEXT: if (MI0.getNumOperands() < 1) -// CHECK-NEXT: return false; -// CHECK-NEXT: if ((MI0.getOpcode() == TargetOpcode::G_BR) && -// CHECK-NEXT: ((/* target */ (MI0.getOperand(0).isMBB())))) { - -// CHECK-NEXT: // (br (bb:Other):$target) => (BR (bb:Other):$target) -// CHECK-NEXT: I.setDesc(TII.get(MyTarget::BR)); -// CHECK-NEXT: MachineInstr &NewI = I; -// CHECK-NEXT: constrainSelectedInstRegOperands(NewI, TII, TRI, RBI); -// CHECK-NEXT: return true; -// CHECK-NEXT: } -// CHECK-NEXT: return false; -// CHECK-NEXT: }()) { return true; } +// CHECK-LABEL: MatchTable16[] = { +// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/1, +// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, TargetOpcode::G_BR, +// CHECK-NEXT: // MIs[0] target +// CHECK-NEXT: GIM_CheckIsMBB, /*MI*/0, /*Op*/0, +// CHECK-NEXT: // (br (bb:Other):$target) => (BR (bb:Other):$target) +// CHECK-NEXT: GIR_MutateOpcode, /*InsnID*/0, /*RecycleInsnID*/ 0, /*Opcode*/MyTarget::BR, +// CHECK-NEXT: GIR_ConstrainSelectedInstOperands, /*InsnID*/0, +// CHECK-NEXT: GIR_Done, +// CHECK-NEXT: }; +// CHECK-NEXT: MIs.resize(1); +// CHECK-NEXT: DEBUG(dbgs() << "Processing MatchTable16\n"); +// CHECK-NEXT: if (executeMatchTable(*this, OutMIs, State, MatcherInfo, MatchTable16, TII, MRI, TRI, RBI, AvailableFeatures)) { +// CHECK-NEXT: return true; +// CHECK-NEXT: } def BR : I<(outs), (ins unknown:$target), [(br bb:$target)]>; diff --git a/test/TableGen/UnterminatedComment.td b/test/TableGen/UnterminatedComment.td index f92525a991644..f386e4cef83be 100644 --- a/test/TableGen/UnterminatedComment.td +++ b/test/TableGen/UnterminatedComment.td @@ -1,4 +1,4 @@ -// RUN: not llvm-tblgen < %s >& /dev/null +// RUN: not llvm-tblgen < %s > /dev/null 2>&1 def x; diff --git a/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll new file mode 100644 index 0000000000000..7ce8ab3ac5215 --- /dev/null +++ b/test/Transforms/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll @@ -0,0 +1,38 @@ +; RUN: opt -argpromotion -verify -dse -S %s -o - | FileCheck %s + +; Fix for PR33641. ArgumentPromotion removed the argument to bar but left the call to +; dbg.value which still used the removed argument. + +%p_t = type i16* +%fun_t = type void (%p_t)* + +define void @foo() { + %tmp = alloca %fun_t + store %fun_t @bar, %fun_t* %tmp + ret void +} + +define internal void @bar(%p_t %p) { + call void @llvm.dbg.value(metadata %p_t %p, i64 0, metadata !4, metadata !5), !dbg !6 + ret void +} + +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1) +!1 = !DIFile(filename: "test.c", directory: "") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "bar", unit: !0) +!4 = !DILocalVariable(name: "p", scope: !3) +!5 = !DIExpression() +!6 = !DILocation(line: 1, column: 1, scope: !3) + +; The %p argument should be removed, and the use of it in dbg.value should be +; changed to undef. 
+; CHECK: define internal void @bar() { +; CHECK-NEXT: call void @llvm.dbg.value(metadata i16* undef +; CHECK-NEXT: ret void +; CHECK-NEXT: } diff --git a/test/Transforms/CodeGenPrepare/X86/memcmp.ll b/test/Transforms/CodeGenPrepare/X86/memcmp.ll index 2435cd7d0a830..4b9e7c3956f58 100644 --- a/test/Transforms/CodeGenPrepare/X86/memcmp.ll +++ b/test/Transforms/CodeGenPrepare/X86/memcmp.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -codegenprepare -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 ; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 @@ -5,8 +6,8 @@ declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i16* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* ; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] ; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] ; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) @@ -23,7 +24,7 @@ define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) @@ -32,8 +33,8 @@ define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp4( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* ; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] ; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] ; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) @@ -50,7 +51,7 @@ define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) @@ -59,7 +60,7 @@ define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) @@ -68,7 +69,7 @@ define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp7( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* 
[[X:%.*]], i8* [[Y:%.*]], i64 7) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) @@ -77,12 +78,12 @@ define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X32-LABEL: @cmp8( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8) +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8) ; X32-NEXT: ret i32 [[CALL]] ; ; X64-LABEL: @cmp8( -; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i64* -; X64-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i64* +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* ; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]] ; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]] ; X64-NEXT: [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]]) @@ -99,7 +100,7 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp9( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 9) @@ -108,7 +109,7 @@ define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp10( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 10) @@ -117,7 +118,7 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp11( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 11) @@ -126,7 +127,7 @@ define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp12( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 12) @@ -135,7 +136,7 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp13( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 13) @@ -144,7 +145,7 @@ define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp14( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 14) @@ -153,7 +154,7 @@ define i32 @cmp14(i8* 
nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp15( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 15) @@ -162,7 +163,7 @@ define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp16( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) ; ALL-NEXT: ret i32 [[CALL]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) @@ -171,8 +172,8 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i16* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* ; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]] ; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]] ; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] @@ -189,7 +190,7 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq3( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 3) ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] @@ -202,8 +203,8 @@ define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq4( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i32* +; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* ; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]] ; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]] ; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] @@ -220,7 +221,7 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq5( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 5) ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] @@ -233,7 +234,7 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; ALL-LABEL: @cmp_eq6( -; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) +; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 6) ; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; ALL-NEXT: ret i32 [[CONV]] @@ -246,7 +247,7 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { define i32 @cmp_eq7(i8* 
nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq7(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 7)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 7)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -259,14 +260,14 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; X32-LABEL: @cmp_eq8(
-; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 8)
+; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 8)
; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; X32-NEXT: ret i32 [[CONV]]
;
; X64-LABEL: @cmp_eq8(
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* %x to i64*
-; X64-NEXT: [[TMP2:%.*]] = bitcast i8* %y to i64*
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64*
; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
; X64-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
; X64-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
@@ -283,7 +284,7 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq9(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 9)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -296,7 +297,7 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq10(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 10)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -309,7 +310,7 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq11(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 11)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -322,7 +323,7 @@ define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq12(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 12)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -335,7 +336,7 @@ define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq13(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 13)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -348,7 +349,7 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq14(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 14)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -361,7 +362,7 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq15(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 15)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
@@ -374,7 +375,7 @@ define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) {
define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; ALL-LABEL: @cmp_eq16(
-; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* %x, i8* %y, i64 16)
+; ALL-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16)
; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
; ALL-NEXT: ret i32 [[CONV]]
diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
index 9d6e668167fbb..b6b7757978263 100644
--- a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
+++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
@@ -4,6 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"

+@x = external global [1 x [2 x <4 x float>]]
+
; Can we sink single addressing mode computation to use?
define void @test1(i1 %cond, i64* %base) {
; CHECK-LABEL: @test1
@@ -194,3 +196,25 @@ rare.2:

declare void @slowpath(i32, i32*)
+
+; Make sure we don't end up in an infinite loop after we fail to sink.
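+; (The GEP below has an undef index, so address-mode sinking gives up on it;
+; the CHECK line pins the unsunk form so that the failed sink is not retried
+; forever.)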
+; CHECK-LABEL: define void @test8
+; CHECK: %ptr = getelementptr i8, i8* %aFOO_load_ptr2int_2void, i32 undef
+define void @test8() {
+allocas:
+  %aFOO_load = load float*, float** undef
+  %aFOO_load_ptr2int = ptrtoint float* %aFOO_load to i64
+  %aFOO_load_ptr2int_broadcast_init = insertelement <4 x i64> undef, i64 %aFOO_load_ptr2int, i32 0
+  %aFOO_load_ptr2int_2void = inttoptr i64 %aFOO_load_ptr2int to i8*
+  %ptr = getelementptr i8, i8* %aFOO_load_ptr2int_2void, i32 undef
+  br label %load.i145
+
+load.i145:
+  %ptr.i143 = bitcast i8* %ptr to <4 x float>*
+  %valall.i144 = load <4 x float>, <4 x float>* %ptr.i143, align 4
+  %x_offset = getelementptr [1 x [2 x <4 x float>]], [1 x [2 x <4 x float>]]* @x, i32 0, i64 0
+  br label %pl_loop.i.i122
+
+pl_loop.i.i122:
+  br label %pl_loop.i.i122
+}
diff --git a/test/Transforms/CodeGenPrepare/crash-on-large-allocas.ll b/test/Transforms/CodeGenPrepare/crash-on-large-allocas.ll
new file mode 100644
index 0000000000000..3808c0e61c10a
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/crash-on-large-allocas.ll
@@ -0,0 +1,16 @@
+; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
+;
+; Ensure that we don't {crash,return a bad value} when given an alloca larger
+; than what a pointer can represent.
+
+target datalayout = "p:16:16"
+
+; CHECK-LABEL: @alloca_overflow_is_unknown(
+define i16 @alloca_overflow_is_unknown() {
+  %i = alloca i8, i32 65537
+  %j = call i16 @llvm.objectsize.i16.p0i8(i8* %i, i1 false, i1 false)
+  ; CHECK: ret i16 -1
+  ret i16 %j
+}
+
+declare i16 @llvm.objectsize.i16.p0i8(i8*, i1, i1)
diff --git a/test/Transforms/ConstantHoisting/ARM/bad-cases.ll b/test/Transforms/ConstantHoisting/ARM/bad-cases.ll
index ffcfb2e56c95d..315e69998c627 100644
--- a/test/Transforms/ConstantHoisting/ARM/bad-cases.ll
+++ b/test/Transforms/ConstantHoisting/ARM/bad-cases.ll
@@ -107,3 +107,34 @@ entry:
  %ret = add i32 %cast0, %cast1
  ret i32 %ret
}
+
+@exception_type = external global i8
+
+; Constants in inline ASM should not be hoisted.
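+; (Rewriting the immediate as a hoisted %const could violate the asm's
+; operand constraints, so both uses of 214672 below must stay literal.)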
+define i32 @inline_asm_invoke() personality i8* null {
+;CHECK-LABEL: @inline_asm_invoke
+;CHECK-NOT: %const = 214672
+;CHECK: %X = invoke i32 asm "bswap $0", "=r,r"(i32 214672)
+  %X = invoke i32 asm "bswap $0", "=r,r"(i32 214672)
+    to label %L unwind label %lpad
+;CHECK: %Y = invoke i32 asm "bswap $0", "=r,r"(i32 214672)
+  %Y = invoke i32 asm "bswap $0", "=r,r"(i32 214672)
+    to label %L unwind label %lpad
+L:
+  ret i32 %X
+lpad:
+  %lp = landingpad i32
+    cleanup
+    catch i8* @exception_type
+  ret i32 1
+}
+
+define i32 @inline_asm_call() {
+;CHECK-LABEL: @inline_asm_call
+;CHECK-NOT: %const = 214672
+;CHECK: %X = call i32 asm "bswap $0", "=r,r"(i32 214672)
+  %X = call i32 asm "bswap $0", "=r,r"(i32 214672)
+;CHECK: %Y = call i32 asm "bswap $0", "=r,r"(i32 214672)
+  %Y = call i32 asm "bswap $0", "=r,r"(i32 214672)
+  ret i32 %X
+}
diff --git a/test/Transforms/ConstantHoisting/ARM/insertvalue.ll b/test/Transforms/ConstantHoisting/ARM/insertvalue.ll
new file mode 100644
index 0000000000000..99fe7fbe22a56
--- /dev/null
+++ b/test/Transforms/ConstantHoisting/ARM/insertvalue.ll
@@ -0,0 +1,31 @@
+; RUN: opt -consthoist -S < %s | FileCheck %s
+target triple = "thumbv6m-none-eabi"
+
+%T = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
+i32, i32, i32, i32, i32, i32 }
+
+; The second operand of insertvalue is able to be hoisted.
+define void @test1(%T %P) {
+; CHECK-LABEL: @test1
+; CHECK: %const = bitcast i32 256 to i32
+; CHECK: %1 = insertvalue %T %P, i32 %const, 256
+; CHECK: %2 = insertvalue %T %P, i32 %const, 256
+  %1 = insertvalue %T %P, i32 256, 256
+  %2 = insertvalue %T %P, i32 256, 256
+  ret void
+}
diff --git a/test/Transforms/ConstantHoisting/X86/ehpad.ll b/test/Transforms/ConstantHoisting/X86/ehpad.ll
index 4f87572f34472..5e345c4515d71 100644
--- a/test/Transforms/ConstantHoisting/X86/ehpad.ll
+++ b/test/Transforms/ConstantHoisting/X86/ehpad.ll
@@ -1,9 +1,6 @@
-; RUN: opt -S -consthoist < %s | FileCheck %s
+; RUN: opt -S -consthoist -consthoist-with-block-frequency=false < %s | FileCheck %s
; RUN: opt -S -consthoist -consthoist-with-block-frequency=true < %s | FileCheck --check-prefix=BFIHOIST %s

-; FIXME: The catchpad doesn't even use the constant, so a better fix would be to
-; insert the bitcast in the catchpad block.
-
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc"
diff --git a/test/Transforms/GVN/PRE/atomic.ll b/test/Transforms/GVN/PRE/atomic.ll
index 509acd613e955..3479bc9a0e33a 100644
--- a/test/Transforms/GVN/PRE/atomic.ll
+++ b/test/Transforms/GVN/PRE/atomic.ll
@@ -208,14 +208,14 @@ define void @fence_seq_cst(i32* %P1, i32* %P2) {
  ret void
}

-; Can't DSE across a full singlethread fence
+; Can't DSE across a full syncscope("singlethread") fence
define void @fence_seq_cst_st(i32* %P1, i32* %P2) {
; CHECK-LABEL: @fence_seq_cst_st(
; CHECK: store
-; CHECK: fence singlethread seq_cst
+; CHECK: fence syncscope("singlethread") seq_cst
; CHECK: store
  store i32 0, i32* %P1, align 4
-  fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
  store i32 0, i32* %P1, align 4
  ret void
}
diff --git a/test/Transforms/GVN/PRE/phi-translate-2.ll b/test/Transforms/GVN/PRE/phi-translate-2.ll
deleted file mode 100644
index 78681e20df5e1..0000000000000
--- a/test/Transforms/GVN/PRE/phi-translate-2.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; RUN: opt < %s -gvn -S | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-@a = common global [100 x i64] zeroinitializer, align 16
-@b = common global [100 x i64] zeroinitializer, align 16
-@g1 = common global i64 0, align 8
-@g2 = common global i64 0, align 8
-@g3 = common global i64 0, align 8
-declare i64 @goo(...) local_unnamed_addr #1
-
-define void @test1(i64 %a, i64 %b, i64 %c, i64 %d) {
-entry:
-  %mul = mul nsw i64 %b, %a
-  store i64 %mul, i64* @g1, align 8
-  %t0 = load i64, i64* @g2, align 8
-  %cmp = icmp sgt i64 %t0, 3
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then: ; preds = %entry
-  %mul2 = mul nsw i64 %d, %c
-  store i64 %mul2, i64* @g2, align 8
-  br label %if.end
-
-; Check phi-translate works and mul is removed.
-; CHECK-LABEL: @test1(
-; CHECK: if.end:
-; CHECK: %[[MULPHI:.*]] = phi i64 [ {{.*}}, %if.then ], [ %mul, %entry ]
-; CHECK-NOT: = mul
-; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
-if.end: ; preds = %if.then, %entry
-  %b.addr.0 = phi i64 [ %d, %if.then ], [ %b, %entry ]
-  %a.addr.0 = phi i64 [ %c, %if.then ], [ %a, %entry ]
-  %mul3 = mul nsw i64 %a.addr.0, %b.addr.0
-  store i64 %mul3, i64* @g3, align 8
-  ret void
-}
-
-define void @test2(i64 %i) {
-entry:
-  %arrayidx = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i
-  %t0 = load i64, i64* %arrayidx, align 8
-  %arrayidx1 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i
-  %t1 = load i64, i64* %arrayidx1, align 8
-  %mul = mul nsw i64 %t1, %t0
-  store i64 %mul, i64* @g1, align 8
-  %cmp = icmp sgt i64 %mul, 3
-  br i1 %cmp, label %if.then, label %if.end
-
-; Check phi-translate works for the phi generated by loadpre. A new mul will be
-; inserted in if.then block.
-; CHECK-LABEL: @test2(
-; CHECK: if.then:
-; CHECK: %[[MUL_THEN:.*]] = mul
-; CHECK: br label %if.end
-if.then: ; preds = %entry
-  %call = tail call i64 (...) @goo() #2
-  store i64 %call, i64* @g2, align 8
-  br label %if.end
-
-; CHECK: if.end:
-; CHECK: %[[MULPHI:.*]] = phi i64 [ %[[MUL_THEN]], %if.then ], [ %mul, %entry ]
-; CHECK-NOT: = mul
-; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
-if.end: ; preds = %if.then, %entry
-  %i.addr.0 = phi i64 [ 3, %if.then ], [ %i, %entry ]
-  %arrayidx3 = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i.addr.0
-  %t2 = load i64, i64* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i.addr.0
-  %t3 = load i64, i64* %arrayidx4, align 8
-  %mul5 = mul nsw i64 %t3, %t2
-  store i64 %mul5, i64* @g3, align 8
-  ret void
-}
-
-; Check phi-translate doesn't go through backedge, which may lead to incorrect
-; pre transformation.
-; CHECK: for.end:
-; CHECK-NOT: %{{.*pre-phi}} = phi
-; CHECK: ret void
-define void @test3(i64 %N, i64* nocapture readonly %a) {
-entry:
-  br label %for.cond
-
-for.cond: ; preds = %for.body, %entry
-  %i.0 = phi i64 [ 0, %entry ], [ %add, %for.body ]
-  %add = add nuw nsw i64 %i.0, 1
-  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %add
-  %tmp0 = load i64, i64* %arrayidx, align 8
-  %cmp = icmp slt i64 %i.0, %N
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
-  %call = tail call i64 (...) @goo() #2
-  %add1 = sub nsw i64 0, %call
-  %tobool = icmp eq i64 %tmp0, %add1
-  br i1 %tobool, label %for.cond, label %for.end
-
-for.end: ; preds = %for.body, %for.cond
-  %i.0.lcssa = phi i64 [ %i.0, %for.body ], [ %i.0, %for.cond ]
-  %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.0.lcssa
-  %tmp1 = load i64, i64* %arrayidx2, align 8
-  store i64 %tmp1, i64* @g1, align 8
-  ret void
-}
-
-; It is incorrect to use the value of %andres in last loop iteration
-; to do pre.
-; CHECK-LABEL: @test4(
-; CHECK: for.body:
-; CHECK-NOT: %andres.pre-phi = phi i32
-; CHECK: br i1 %tobool1
-
-define i32 @test4(i32 %cond, i32 %SectionAttrs.0231.ph, i32 *%AttrFlag) {
-for.body.preheader:
-  %t514 = load volatile i32, i32* %AttrFlag
-  br label %for.body
-
-for.body:
-  %t320 = phi i32 [ %t334, %bb343 ], [ %t514, %for.body.preheader ]
-  %andres = and i32 %t320, %SectionAttrs.0231.ph
-  %tobool1 = icmp eq i32 %andres, 0
-  br i1 %tobool1, label %bb343, label %critedge.loopexit
-
-bb343:
-  %t334 = load volatile i32, i32* %AttrFlag
-  %tobool2 = icmp eq i32 %cond, 0
-  br i1 %tobool2, label %critedge.loopexit, label %for.body
-
-critedge.loopexit:
-  unreachable
-}
diff --git a/test/Transforms/GVN/PRE/pre-gep-load.ll b/test/Transforms/GVN/PRE/pre-gep-load.ll
index 1b2b4d20d31da..9eec8bb6455b4 100644
--- a/test/Transforms/GVN/PRE/pre-gep-load.ll
+++ b/test/Transforms/GVN/PRE/pre-gep-load.ll
@@ -37,7 +37,7 @@ sw.bb2: ; preds = %if.end, %entry
  %3 = load double, double* %arrayidx5, align 8
; CHECK: sw.bb2:
; CHECK-NOT: sext
-; CHECK: phi double [
+; CHECK-NEXT: phi double [
; CHECK-NOT: load
  %sub6 = fsub double 3.000000e+00, %3
  br label %return
diff --git a/test/Transforms/GVN/PRE/pre-load.ll b/test/Transforms/GVN/PRE/pre-load.ll
index ffff2b7f08e53..685df24f62b65 100644
--- a/test/Transforms/GVN/PRE/pre-load.ll
+++ b/test/Transforms/GVN/PRE/pre-load.ll
@@ -72,7 +72,7 @@ block4:
  %PRE = load i32, i32* %P3
  ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
@@ -104,7 +104,7 @@ block4:
  %PRE = load i32, i32* %P3
  ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
@@ -263,7 +263,7 @@ block4:
  %PRE = load i32, i32* %P3
  ret i32 %PRE
; CHECK: block4:
-; CHECK: phi i32 [
+; CHECK-NEXT: phi i32 [
; CHECK-NOT: load
; CHECK: ret i32
}
diff --git a/test/Transforms/IndVarSimplify/canonicalize-cmp.ll b/test/Transforms/IndVarSimplify/canonicalize-cmp.ll
new file mode 100644
index 0000000000000..2b939767284a4
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/canonicalize-cmp.ll
@@ -0,0 +1,98 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; Check that we replace signed comparisons between non-negative values with
+; unsigned comparisons if we can.
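+; (In both tests below %iv runs from 0 up to 1000, so it is provably
+; non-negative and the 'slt'/'sgt' compares against 100 can become the
+; equivalent 'ult'/'ugt' forms.)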
+
+target datalayout = "n8:16:32:64"
+
+define i32 @test_01(i32 %a, i32 %b, i32* %p) {
+
+; CHECK-LABEL: @test_01(
+; CHECK-NOT: icmp slt
+; CHECK: %cmp1 = icmp ult i32 %iv, 100
+; CHECK: %cmp2 = icmp ult i32 %iv, 100
+; CHECK-NOT: %cmp3
+; CHECK: %exitcond = icmp ne i32 %iv.next, 1000
+
+entry:
+  br label %loop.entry
+
+loop.entry:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.be ]
+  %cmp1 = icmp slt i32 %iv, 100
+  br i1 %cmp1, label %b1, label %b2
+
+b1:
+  store i32 %iv, i32* %p
+  br label %merge
+
+b2:
+  store i32 %a, i32* %p
+  br label %merge
+
+merge:
+  %cmp2 = icmp ult i32 %iv, 100
+  br i1 %cmp2, label %b3, label %b4
+
+b3:
+  store i32 %iv, i32* %p
+  br label %loop.be
+
+b4:
+  store i32 %b, i32* %p
+  br label %loop.be
+
+loop.be:
+  %iv.next = add i32 %iv, 1
+  %cmp3 = icmp slt i32 %iv.next, 1000
+  br i1 %cmp3, label %loop.entry, label %exit
+
+exit:
+  ret i32 %iv
+}
+
+define i32 @test_02(i32 %a, i32 %b, i32* %p) {
+
+; CHECK-LABEL: @test_02(
+; CHECK-NOT: icmp sgt
+; CHECK: %cmp1 = icmp ugt i32 100, %iv
+; CHECK: %cmp2 = icmp ugt i32 100, %iv
+; CHECK-NOT: %cmp3
+; CHECK: %exitcond = icmp ne i32 %iv.next, 1000
+
+entry:
+  br label %loop.entry
+
+loop.entry:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.be ]
+  %cmp1 = icmp sgt i32 100, %iv
+  br i1 %cmp1, label %b1, label %b2
+
+b1:
+  store i32 %iv, i32* %p
+  br label %merge
+
+b2:
+  store i32 %a, i32* %p
+  br label %merge
+
+merge:
+  %cmp2 = icmp ugt i32 100, %iv
+  br i1 %cmp2, label %b3, label %b4
+
+b3:
+  store i32 %iv, i32* %p
+  br label %loop.be
+
+b4:
+  store i32 %b, i32* %p
+  br label %loop.be
+
+loop.be:
+  %iv.next = add i32 %iv, 1
+  %cmp3 = icmp sgt i32 1000, %iv.next
+  br i1 %cmp3, label %loop.entry, label %exit
+
+exit:
+  ret i32 %iv
+}
diff --git a/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/test/Transforms/IndVarSimplify/eliminate-comparison.ll
index 612f01e3cadee..a63617e62c0ea 100644
--- a/test/Transforms/IndVarSimplify/eliminate-comparison.ll
+++ b/test/Transforms/IndVarSimplify/eliminate-comparison.ll
@@ -111,7 +111,7 @@ return:

; Indvars should not turn the second loop into an infinite one.
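; (%__key6.0 is known non-negative, which is why the expected compare in the
; two functions below is now the unsigned 'ult' form.)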
; CHECK-LABEL: @func_11(
-; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10
+; CHECK: %tmp5 = icmp ult i32 %__key6.0, 10
; CHECK-NOT: br i1 true, label %noassert68, label %unrolledend

define i32 @func_11() nounwind uwtable {
@@ -163,7 +163,7 @@ declare void @llvm.trap() noreturn nounwind

; In this case the second loop only has a single iteration, fold the header away
; CHECK-LABEL: @func_12(
-; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10
+; CHECK: %tmp5 = icmp ult i32 %__key6.0, 10
; CHECK: br i1 true, label %noassert68, label %unrolledend
define i32 @func_12() nounwind uwtable {
entry:
diff --git a/test/Transforms/IndVarSimplify/strengthen-overflow.ll b/test/Transforms/IndVarSimplify/strengthen-overflow.ll
index 2bafe96e1cccd..6e0538e04d6bd 100644
--- a/test/Transforms/IndVarSimplify/strengthen-overflow.ll
+++ b/test/Transforms/IndVarSimplify/strengthen-overflow.ll
@@ -104,5 +104,89 @@ define i32 @test.unsigned.add.1(i32* %array, i32 %length, i32 %init) {
  ret i32 42
}

+define hidden void @test.shl.exact.equal() {
+; CHECK-LABEL: @test.shl.exact.equal
+entry:
+  br label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+  %k.021 = phi i32 [ 1, %entry ], [ %inc, %for.body ]
+  %shl = shl i32 1, %k.021
+  %shr1 = ashr i32 %shl, 1
+; CHECK: %shr1 = ashr exact i32 %shl, 1
+  %shr2 = lshr i32 %shl, 1
+; CHECK: %shr2 = lshr exact i32 %shl, 1
+  %inc = add nuw nsw i32 %k.021, 1
+  %exitcond = icmp eq i32 %inc, 9
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define hidden void @test.shl.exact.greater() {
+; CHECK-LABEL: @test.shl.exact.greater
+entry:
+  br label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+  %k.021 = phi i32 [ 3, %entry ], [ %inc, %for.body ]
+  %shl = shl i32 1, %k.021
+  %shr1 = ashr i32 %shl, 2
+; CHECK: %shr1 = ashr exact i32 %shl, 2
+  %shr2 = lshr i32 %shl, 2
+; CHECK: %shr2 = lshr exact i32 %shl, 2
+  %inc = add nuw nsw i32 %k.021, 1
+  %exitcond = icmp eq i32 %inc, 9
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define hidden void @test.shl.exact.unbound(i32 %arg) {
+; CHECK-LABEL: @test.shl.exact.unbound
+entry:
+  br label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+  %k.021 = phi i32 [ 2, %entry ], [ %inc, %for.body ]
+  %shl = shl i32 1, %k.021
+  %shr1 = ashr i32 %shl, 2
+; CHECK: %shr1 = ashr exact i32 %shl, 2
+  %shr2 = lshr i32 %shl, 2
+; CHECK: %shr2 = lshr exact i32 %shl, 2
+  %inc = add nuw nsw i32 %k.021, 1
+  %exitcond = icmp eq i32 %inc, %arg
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define hidden void @test.shl.nonexact() {
+; CHECK-LABEL: @test.shl.nonexact
+entry:
+  br label %for.body
+
+for.body:
+; CHECK-LABEL: for.body
+  %k.021 = phi i32 [ 2, %entry ], [ %inc, %for.body ]
+  %shl = shl i32 1, %k.021
+  %shr1 = ashr i32 %shl, 3
+; CHECK: %shr1 = ashr i32 %shl, 3
+  %shr2 = lshr i32 %shl, 3
+; CHECK: %shr2 = lshr i32 %shl, 3
+  %inc = add nuw nsw i32 %k.021, 1
+  %exitcond = icmp eq i32 %inc, 9
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
!0 = !{i32 0, i32 2}
!1 = !{i32 0, i32 42}
diff --git a/test/Transforms/IndVarSimplify/widen-loop-comp.ll b/test/Transforms/IndVarSimplify/widen-loop-comp.ll
index b87cd0550192e..2d24cd732ce84 100644
--- a/test/Transforms/IndVarSimplify/widen-loop-comp.ll
+++ b/test/Transforms/IndVarSimplify/widen-loop-comp.ll
@@ -64,7 +64,7 @@ for.end:
; CHECK-LABEL: @test2
; CHECK: for.body4.us
; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-; CHECK: %cmp2.us = icmp slt i64
+; CHECK: %cmp2.us = icmp ult i64
; CHECK-NOT: %2 = trunc i64 %indvars.iv.next to i32
; CHECK-NOT: %cmp2.us = icmp slt i32
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
index b566c147e9b88..1eab707540300 100644
--- a/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
@@ -170,4 +170,16 @@ define { i32 addrspace(4)*, i1 } @cmpxchg_group_to_flat_wrong_operand(i32 addrsp
  ret { i32 addrspace(4)*, i1 } %ret
}

+; Null pointer in local addr space
+; CHECK-LABEL: @local_nullptr
+; CHECK: icmp ne i8 addrspace(3)* %a, addrspacecast (i8* null to i8 addrspace(3)*)
+; CHECK-NOT: i8 addrspace(3)* null
+define void @local_nullptr(i32 addrspace(1)* nocapture %results, i8 addrspace(3)* %a) {
+entry:
+  %tobool = icmp ne i8 addrspace(3)* %a, addrspacecast (i8* null to i8 addrspace(3)*)
+  %conv = zext i1 %tobool to i32
+  store i32 %conv, i32 addrspace(1)* %results, align 4
+  ret void
+}
+
attributes #0 = { nounwind }
diff --git a/test/Transforms/Inline/ARM/inline-target-attr.ll b/test/Transforms/Inline/ARM/inline-target-attr.ll
new file mode 100644
index 0000000000000..5bbecd2035288
--- /dev/null
+++ b/test/Transforms/Inline/ARM/inline-target-attr.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -inline | FileCheck %s
+; RUN: opt < %s -mtriple=arm-unknown-linux-gnu -S -passes='cgscc(inline)' | FileCheck %s
+; Check that we only inline when we have compatible target attributes.
+; ARM has implemented a target attribute that will verify that the attribute
+; sets are compatible.
+
+define i32 @foo() #0 {
+entry:
+  %call = call i32 (...) @baz()
+  ret i32 %call
+; CHECK-LABEL: foo
+; CHECK: call i32 (...) @baz()
+}
+declare i32 @baz(...) #0
+
+define i32 @bar() #1 {
+entry:
+  %call = call i32 @foo()
+  ret i32 %call
+; CHECK-LABEL: bar
+; CHECK: call i32 (...) @baz()
+}
+
+define i32 @qux() #0 {
+entry:
+  %call = call i32 @bar()
+  ret i32 %call
+; CHECK-LABEL: qux
+; CHECK: call i32 @bar()
+}
+
+define i32 @thumb_fn() #2 {
+entry:
+  %call = call i32 @foo()
+  ret i32 %call
+; CHECK-LABEL: thumb_fn
+; CHECK: call i32 @foo
+}
+
+define i32 @strict_align() #3 {
+entry:
+  %call = call i32 @foo()
+  ret i32 %call
+; CHECK-LABEL: strict_align
+; CHECK: call i32 (...) @baz()
+}
+
+define i32 @soft_float_fn() #4 {
+entry:
+  %call = call i32 @foo()
+  ret i32 %call
+; CHECK-LABEL: soft_float_fn
+; CHECK: call i32 @foo
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+dsp,+neon" }
+attributes #1 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+fp16" }
+attributes #2 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+fp16,+thumb-mode" }
+attributes #3 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+strict-align" }
+attributes #4 = { "target-cpu"="generic" "target-features"="+dsp,+neon,+fp16,+soft-float" }
diff --git a/test/Transforms/Inline/ARM/lit.local.cfg b/test/Transforms/Inline/ARM/lit.local.cfg
new file mode 100644
index 0000000000000..236e1d3441665
--- /dev/null
+++ b/test/Transforms/Inline/ARM/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'ARM' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Transforms/Inline/cgscc-incremental-invalidate.ll b/test/Transforms/Inline/cgscc-incremental-invalidate.ll
index 82d321ccf225c..164f7a66a6f3c 100644
--- a/test/Transforms/Inline/cgscc-incremental-invalidate.ll
+++ b/test/Transforms/Inline/cgscc-incremental-invalidate.ll
@@ -11,17 +11,35 @@
; CHECK: Running analysis: FunctionAnalysisManagerCGSCCProxy on (test1_f, test1_g, test1_h)
; CHECK: Running analysis: DominatorTreeAnalysis on test1_f
; CHECK: Running analysis: DominatorTreeAnalysis on test1_g
-; CHECK: Invalidating all non-preserved analyses for: (test1_f, test1_g, test1_h)
+; CHECK: Invalidating all non-preserved analyses for: (test1_f)
; CHECK: Invalidating all non-preserved analyses for: test1_f
; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_f
+; CHECK: Invalidating analysis: LoopAnalysis on test1_f
+; CHECK: Invalidating analysis: BranchProbabilityAnalysis on test1_f
+; CHECK: Invalidating analysis: BlockFrequencyAnalysis on test1_f
+; CHECK: Invalidating all non-preserved analyses for: (test1_g, test1_h)
; CHECK: Invalidating all non-preserved analyses for: test1_g
; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_g
-; CHECK: Invalidating all non-preserved analyses for: test1_h
-; CHECK-NOT: Invalidating anaylsis:
-; CHECK: Running analysis: DominatorTreeAnalysis on test1_h
-; CHECK: Invalidating all non-preserved analyses for: (test1_g, test1_h)
+; CHECK: Invalidating analysis: LoopAnalysis on test1_g
+; CHECK: Invalidating analysis: BranchProbabilityAnalysis on test1_g
+; CHECK: Invalidating analysis: BlockFrequencyAnalysis on test1_g
; CHECK: Invalidating all non-preserved analyses for: test1_h
; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_h
+; CHECK: Invalidating analysis: LoopAnalysis on test1_h
+; CHECK: Invalidating analysis: BranchProbabilityAnalysis on test1_h
+; CHECK: Invalidating analysis: BlockFrequencyAnalysis on test1_h
+; CHECK-NOT: Invalidating analysis:
+; CHECK: Starting llvm::Function pass manager run.
+; CHECK-NEXT: Running pass: DominatorTreeVerifierPass on test1_g
+; CHECK-NEXT: Running analysis: DominatorTreeAnalysis on test1_g
+; CHECK-NEXT: Finished llvm::Function pass manager run.
+; CHECK-NEXT: Starting llvm::Function pass manager run.
+; CHECK-NEXT: Running pass: DominatorTreeVerifierPass on test1_h
+; CHECK-NEXT: Running analysis: DominatorTreeAnalysis on test1_h
+; CHECK-NEXT: Finished llvm::Function pass manager run.
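+; (After the invalidations above, the verifier runs must recompute
+; DominatorTreeAnalysis rather than reuse a stale cached tree.)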
+; CHECK-NOT: Invalidating analysis:
+; CHECK: Running pass: DominatorTreeVerifierPass on test1_f
+; CHECK-NEXT: Running analysis: DominatorTreeAnalysis on test1_f

; An external function used to control branches.
declare i1 @flag()
@@ -109,3 +127,80 @@ entry:
  ret void
; CHECK: ret void
}
+
+; The 'test2_' prefixed code works to carefully trigger forming an SCC with
+; a dominator tree for one of the functions but not the other and without even
+; a function analysis manager proxy for the SCC that things get merged into.
+; Without proper handling when updating the call graph this will find a stale
+; dominator tree.
+
+@test2_global = external global i32, align 4
+
+define void @test2_hoge(i1 (i32*)* %arg) {
+; CHECK-LABEL: define void @test2_hoge(
+bb:
+  %tmp2 = call zeroext i1 %arg(i32* @test2_global)
+; CHECK: call zeroext i1 %arg(
+  br label %bb3
+
+bb3:
+  %tmp5 = call zeroext i1 %arg(i32* @test2_global)
+; CHECK: call zeroext i1 %arg(
+  br i1 %tmp5, label %bb3, label %bb6
+
+bb6:
+  ret void
+}
+
+define zeroext i1 @test2_widget(i32* %arg) {
+; CHECK-LABEL: define zeroext i1 @test2_widget(
+bb:
+  %tmp1 = alloca i8, align 1
+  %tmp2 = alloca i32, align 4
+  call void @test2_quux()
+; CHECK-NOT: call
+;
+; CHECK: call zeroext i1 @test2_widget(i32* @test2_global)
+; CHECK-NEXT: br label %[[NEW_BB:.*]]
+;
+; CHECK: [[NEW_BB]]:
+; CHECK-NEXT: call zeroext i1 @test2_widget(i32* @test2_global)
+;
+; CHECK: {{.*}}:
+
+  call void @test2_hoge.1(i32* %arg)
+; CHECK-NEXT: call void @test2_hoge.1(
+
+  %tmp4 = call zeroext i1 @test2_barney(i32* %tmp2)
+  %tmp5 = zext i1 %tmp4 to i32
+  store i32 %tmp5, i32* %tmp2, align 4
+  %tmp6 = call zeroext i1 @test2_barney(i32* null)
+  call void @test2_ham(i8* %tmp1)
+; CHECK: call void @test2_ham(
+
+  call void @test2_quux()
+; CHECK-NOT: call
+;
+; CHECK: call zeroext i1 @test2_widget(i32* @test2_global)
+; CHECK-NEXT: br label %[[NEW_BB:.*]]
+;
+; CHECK: [[NEW_BB]]:
+; CHECK-NEXT: call zeroext i1 @test2_widget(i32* @test2_global)
+;
+; CHECK: {{.*}}:
+  ret i1 true
+; CHECK-NEXT: ret i1 true
+}
+
+define internal void @test2_quux() {
+; CHECK-NOT: @test2_quux
+bb:
+  call void @test2_hoge(i1 (i32*)* @test2_widget)
+  ret void
+}
+
+declare void @test2_hoge.1(i32*)
+
+declare zeroext i1 @test2_barney(i32*)
+
+declare void @test2_ham(i8*)
diff --git a/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll
new file mode 100644
index 0000000000000..3c4e08b5b515c
--- /dev/null
+++ b/test/Transforms/InstCombine/2017-07-07-UMul-ZExt.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; CHECK: llvm.umul.with.overflow
+define i32 @sterix(i32, i8, i64) {
+entry:
+  %conv = zext i32 %0 to i64
+  %conv1 = sext i8 %1 to i32
+  %mul = mul i32 %conv1, 1945964878
+  %sh_prom = trunc i64 %2 to i32
+  %shr = lshr i32 %mul, %sh_prom
+  %conv2 = zext i32 %shr to i64
+  %mul3 = mul nuw nsw i64 %conv, %conv2
+  %conv6 = and i64 %mul3, 4294967295
+  %tobool = icmp ne i64 %conv6, %mul3
+  br i1 %tobool, label %lor.end, label %lor.rhs
+
+lor.rhs:
+  %and = and i64 %2, %mul3
+  %conv4 = trunc i64 %and to i32
+  %tobool7 = icmp ne i32 %conv4, 0
+  %lnot = xor i1 %tobool7, true
+  br label %lor.end
+
+lor.end:
+  %3 = phi i1 [ true, %entry ], [ %lnot, %lor.rhs ]
+  %conv8 = zext i1 %3 to i32
+  ret i32 %conv8
+}
+
diff --git a/test/Transforms/InstCombine/and-or-not.ll b/test/Transforms/InstCombine/and-or-not.ll
index 1baecb4a13a3b..04f7be01eaf5c 100644
--- a/test/Transforms/InstCombine/and-or-not.ll
+++ b/test/Transforms/InstCombine/and-or-not.ll
@@ -570,10 +570,8 @@ define i32 @xor_to_xnor1(float %fa, float %fb) {
; CHECK-LABEL: @xor_to_xnor1(
; CHECK-NEXT: [[A:%.*]] = fptosi float [[FA:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = fptosi float [[FB:%.*]] to i32
-; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR2_DEMORGAN:%.*]] = and i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR2:%.*]] = xor i32 [[OR2_DEMORGAN]], -1
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR1]], [[OR2]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
  %a = fptosi float %fa to i32
@@ -591,10 +589,8 @@ define i32 @xor_to_xnor2(float %fa, float %fb) {
; CHECK-LABEL: @xor_to_xnor2(
; CHECK-NEXT: [[A:%.*]] = fptosi float [[FA:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = fptosi float [[FB:%.*]] to i32
-; CHECK-NEXT: [[OR1:%.*]] = or i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR2_DEMORGAN:%.*]] = and i32 [[B]], [[A]]
-; CHECK-NEXT: [[OR2:%.*]] = xor i32 [[OR2_DEMORGAN]], -1
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR1]], [[OR2]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
  %a = fptosi float %fa to i32
@@ -612,10 +608,8 @@ define i32 @xor_to_xnor3(float %fa, float %fb) {
; CHECK-LABEL: @xor_to_xnor3(
; CHECK-NEXT: [[A:%.*]] = fptosi float [[FA:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = fptosi float [[FB:%.*]] to i32
-; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR1:%.*]] = xor i32 [[OR1_DEMORGAN]], -1
-; CHECK-NEXT: [[OR2:%.*]] = or i32 [[A]], [[B]]
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR2]], [[OR1]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], [[B]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
  %a = fptosi float %fa to i32
@@ -633,10 +627,8 @@ define i32 @xor_to_xnor4(float %fa, float %fb) {
; CHECK-LABEL: @xor_to_xnor4(
; CHECK-NEXT: [[A:%.*]] = fptosi float [[FA:%.*]] to i32
; CHECK-NEXT: [[B:%.*]] = fptosi float [[FB:%.*]] to i32
-; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i32 [[A]], [[B]]
-; CHECK-NEXT: [[OR1:%.*]] = xor i32 [[OR1_DEMORGAN]], -1
-; CHECK-NEXT: [[OR2:%.*]] = or i32 [[B]], [[A]]
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR2]], [[OR1]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[B]], [[A]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
  %a = fptosi float %fa to i32
diff --git a/test/Transforms/InstCombine/bswap-fold.ll b/test/Transforms/InstCombine/bswap-fold.ll
index 91678a91962a8..260e2330996ed 100644
--- a/test/Transforms/InstCombine/bswap-fold.ll
+++ b/test/Transforms/InstCombine/bswap-fold.ll
@@ -1,35 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s

-define i1 @test1(i16 %t) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i16 %t, 256
-; CHECK-NEXT: ret i1 [[TMP2]]
-;
-  %tmp1 = call i16 @llvm.bswap.i16( i16 %t )
-  %tmp2 = icmp eq i16 %tmp1, 1
-  ret i1 %tmp2
-}
-
-define i1 @test2(i32 %tmp) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: [[TMP_UPGRD_1:%.*]] = icmp eq i32 %tmp, 16777216
-; CHECK-NEXT: ret i1 [[TMP_UPGRD_1]]
-;
-  %tmp34 = tail call i32 @llvm.bswap.i32( i32 %tmp )
-  %tmp.upgrd.1 = icmp eq i32 %tmp34, 1
-  ret i1 %tmp.upgrd.1
-}
-
-define i1 @test3(i64 %tmp) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT: [[TMP_UPGRD_2:%.*]] = icmp eq i64 %tmp, 72057594037927936
-; CHECK-NEXT: ret i1 [[TMP_UPGRD_2]]
-;
-  %tmp34 = tail call i64 @llvm.bswap.i64( i64 %tmp )
-  %tmp.upgrd.2 = icmp eq i64 %tmp34, 1
-  ret i1 %tmp.upgrd.2
-}
-
; rdar://5992453
; A & 255
define i32 @test4(i32 %a) nounwind {
@@ -241,6 +212,136 @@ define i64 @bs_xor64(i64 %a, i64 %b) #0 {
  ret i64 %tmp3
}

+define <2 x i32> @bs_and32vec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_and32vec(
+; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+  %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+  %tmp2 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b)
+  %tmp3 = and <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @bs_or32vec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_or32vec(
+; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+  %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+  %tmp2 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b)
+  %tmp3 = or <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @bs_xor32vec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_xor32vec(
+; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+  %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+  %tmp2 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %b)
+  %tmp3 = xor <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @bs_and32ivec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_and32ivec(
+; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[A:%.*]], <i32 -1585053440, i32 -1585053440>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+  %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+  %tmp2 = and <2 x i32> %tmp1, <i32 100001, i32 100001>
+  ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @bs_or32ivec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_or32ivec(
+; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[A:%.*]], <i32 -1585053440, i32 -1585053440>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+  %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+  %tmp2 = or <2 x i32> %tmp1, <i32 100001, i32 100001>
+  ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @bs_xor32ivec(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: @bs_xor32ivec(
+; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[A:%.*]], <i32 -1585053440, i32 -1585053440>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i32> [[TMP2]]
+;
+  %tmp1 = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+  %tmp2 = xor <2 x i32> %tmp1, <i32 100001, i32 100001>
+  ret <2 x i32> %tmp2
+}
+
+define i64 @bs_and64_multiuse1(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64_multiuse1(
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[A:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]])
+; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP2]]
+; CHECK-NEXT: ret i64 [[TMP5]]
+;
+  %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %tmp3 = and i64 %tmp1, %tmp2
+  %tmp4 = mul i64 %tmp3, %tmp1 ; to increase use count of the bswaps
+  %tmp5 = mul i64 %tmp4, %tmp2 ; to increase use count of the bswaps
+  ret i64 %tmp5
+}
+
+define i64 @bs_and64_multiuse2(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64_multiuse2(
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[A:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[A]], [[B:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+  %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %tmp3 = and i64 %tmp1, %tmp2
+  %tmp4 = mul i64 %tmp3, %tmp1 ; to increase use count of the bswaps
+  ret i64 %tmp4
+}
+
+define i64 @bs_and64_multiuse3(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64_multiuse3(
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[B:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+  %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
+  %tmp3 = and i64 %tmp1, %tmp2
+  %tmp4 = mul i64 %tmp3, %tmp2 ; to increase use count of the bswaps
+  ret i64 %tmp4
+}
+
+define i64 @bs_and64i_multiuse(i64 %a, i64 %b) #0 {
+; CHECK-LABEL: @bs_and64i_multiuse(
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[A:%.*]])
+; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 1000000001
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT: ret i64 [[TMP3]]
+;
+  %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
+  %tmp2 = and i64 %tmp1, 1000000001
+  %tmp3 = mul i64 %tmp2, %tmp1 ; to increase use count of the bswap
+  ret i64 %tmp3
+}
+
declare i16 @llvm.bswap.i16(i16)
declare i32 @llvm.bswap.i32(i32)
declare i64 @llvm.bswap.i64(i64)
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
diff --git a/test/Transforms/InstCombine/cmp-intrinsic.ll b/test/Transforms/InstCombine/cmp-intrinsic.ll
new file mode 100644
index 0000000000000..7fc1d12916bf8
--- /dev/null
+++ b/test/Transforms/InstCombine/cmp-intrinsic.ll
@@ -0,0 +1,123 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
+declare i33 @llvm.cttz.i33(i33, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i8 @llvm.ctpop.i8(i8)
+declare i11 @llvm.ctpop.i11(i11)
+declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1)
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1)
+declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
+
+define i1 @bswap_eq_i16(i16 %x) {
+; CHECK-LABEL: @bswap_eq_i16(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 %x, 256
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+  %bs = call i16 @llvm.bswap.i16(i16 %x)
+  %cmp = icmp eq i16 %bs, 1
+  ret i1 %cmp
+}
+
+define i1 @bswap_ne_i32(i32 %x) {
+; CHECK-LABEL: @bswap_ne_i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 %x, 33554432
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+  %bs = tail call i32 @llvm.bswap.i32(i32 %x)
+  %cmp = icmp ne i32 %bs, 2
+  ret i1 %cmp
+}
+
+define <2 x i1> @bswap_eq_v2i64(<2 x i64> %x) {
+; CHECK-LABEL: @bswap_eq_v2i64(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i64> %x, <i64 216172782113783808, i64 216172782113783808>
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+  %bs = tail call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %x)
+  %cmp = icmp eq <2 x i64> %bs, <i64 3, i64 3>
+  ret <2 x i1> %cmp
+}
+
+define i1 @ctlz_eq_bitwidth_i32(i32 %x) {
+; CHECK-LABEL: @ctlz_eq_bitwidth_i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 %x, 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+  %lz = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  %cmp = icmp eq i32 %lz, 32
+  ret i1 %cmp
+}
+
+define <2 x i1> @ctlz_ne_bitwidth_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @ctlz_ne_bitwidth_v2i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> %a, zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+  %x = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
+  %cmp = icmp ne <2 x i32> %x, <i32 32, i32 32>
+  ret <2 x i1> %cmp
+}
+
+define i1 @cttz_ne_bitwidth_i33(i33 %x) {
+; CHECK-LABEL: @cttz_ne_bitwidth_i33(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i33 %x, 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+  %tz = tail call i33 @llvm.cttz.i33(i33 %x, i1 false)
+  %cmp = icmp ne i33 %tz, 33
+  ret i1 %cmp
+}
+
+define <2 x i1> @cttz_eq_bitwidth_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: @cttz_eq_bitwidth_v2i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+  %x = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
+  %cmp = icmp eq <2 x i32> %x, <i32 32, i32 32>
+  ret <2 x i1> %cmp
+}
+
+define i1 @ctpop_eq_zero_i11(i11 %x) {
+; CHECK-LABEL: @ctpop_eq_zero_i11(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i11 %x, 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+  %pop = tail call i11 @llvm.ctpop.i11(i11 %x)
+  %cmp = icmp eq i11 %pop, 0
+  ret i1 %cmp
+}
+
+define <2 x i1> @ctpop_ne_zero_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: @ctpop_ne_zero_v2i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> %x, zeroinitializer
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+  %pop = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
+  %cmp = icmp ne <2 x i32> %pop, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+define i1 @ctpop_eq_bitwidth_i8(i8 %x) {
+; CHECK-LABEL: @ctpop_eq_bitwidth_i8(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 %x, -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+  %pop = tail call i8 @llvm.ctpop.i8(i8 %x)
+  %cmp = icmp eq i8 %pop, 8
+  ret i1 %cmp
+}
+
+define <2 x i1> @ctpop_ne_bitwidth_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: @ctpop_ne_bitwidth_v2i32(
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> %x, <i32 -1, i32 -1>
+; CHECK-NEXT: ret <2 x i1> [[CMP]]
+;
+  %pop = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
+  %cmp = icmp ne <2 x i32> %pop, <i32 32, i32 32>
+  ret <2 x i1> %cmp
+}
+
diff --git a/test/Transforms/InstCombine/consecutive-fences.ll b/test/Transforms/InstCombine/consecutive-fences.ll
index 6f1c412773861..8ecb399f39cb8 100644
--- a/test/Transforms/InstCombine/consecutive-fences.ll
+++ b/test/Transforms/InstCombine/consecutive-fences.ll
@@ -4,7 +4,7 @@
; CHECK-LABEL: define void @tinkywinky
; CHECK-NEXT:   fence seq_cst
-; CHECK-NEXT:   fence singlethread acquire
+; CHECK-NEXT:   fence syncscope("singlethread") acquire
; CHECK-NEXT:   ret void
; CHECK-NEXT: }
@@ -12,21 +12,21 @@
define void @tinkywinky() {
  fence seq_cst
  fence seq_cst
  fence seq_cst
-  fence singlethread acquire
-  fence singlethread acquire
-  fence singlethread acquire
+  fence syncscope("singlethread") acquire
+  fence syncscope("singlethread") acquire
+  fence syncscope("singlethread") acquire
  ret void
}

; CHECK-LABEL: define void @dipsy
; CHECK-NEXT:   fence seq_cst
-; CHECK-NEXT:   fence singlethread seq_cst
+; CHECK-NEXT:   fence syncscope("singlethread") seq_cst
; CHECK-NEXT:   ret void
; CHECK-NEXT: }

define void @dipsy() {
  fence seq_cst
-  fence singlethread seq_cst
+  fence syncscope("singlethread") seq_cst
  ret void
}
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 127fde10e9f7b..a12f4206b1c6d 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -2979,9 +2979,7 @@ declare i32 @llvm.bswap.i32(i32)

define i1 @bswap_ne(i32 %x, i32 %y) {
; CHECK-LABEL: @bswap_ne(
-; CHECK-NEXT: [[SWAPX:%.*]] = call i32 @llvm.bswap.i32(i32 %x)
-; CHECK-NEXT: [[SWAPY:%.*]] = call i32 @llvm.bswap.i32(i32 %y)
-; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[SWAPX]], [[SWAPY]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 %x, %y
; CHECK-NEXT: ret i1 [[CMP]]
;
  %swapx = call i32 @llvm.bswap.i32(i32 %x)
@@ -2994,9 +2992,7 @@ declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)

define <8 x i1> @bswap_vec_eq(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: @bswap_vec_eq(
-; CHECK-NEXT: [[SWAPX:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %x)
-; CHECK-NEXT: [[SWAPY:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %y)
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <8 x i16> [[SWAPX]], [[SWAPY]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq <8 x i16> %x, %y
; CHECK-NEXT: ret <8 x i1> [[CMP]]
;
  %swapx = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %x)
@@ -3009,9 +3005,7 @@ declare i64 @llvm.bitreverse.i64(i64)

define i1 @bitreverse_eq(i64 %x, i64 %y) {
; CHECK-LABEL: @bitreverse_eq(
-; CHECK-NEXT: [[REVX:%.*]] = call i64 @llvm.bitreverse.i64(i64 %x)
-; CHECK-NEXT: [[REVY:%.*]] = call i64 @llvm.bitreverse.i64(i64 %y)
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[REVX]], [[REVY]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 %x, %y
; CHECK-NEXT: ret i1 [[CMP]]
;
  %revx = call i64 @llvm.bitreverse.i64(i64 %x)
@@ -3024,9 +3018,7 @@ declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)

define <8 x i1> @bitreverse_vec_ne(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: @bitreverse_vec_ne(
-; CHECK-NEXT: [[REVX:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %x)
-; CHECK-NEXT: [[REVY:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %y)
-; CHECK-NEXT: [[CMP:%.*]] = icmp ne <8 x i16> [[REVX]], [[REVY]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <8 x i16> %x, %y
; CHECK-NEXT: ret <8 x i1> [[CMP]]
;
  %revx = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %x)
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index c294d79f15efe..8d2f06edcaf3d 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -475,66 +475,6 @@ define <2 x i1> @ctlz_knownbits3_vec(<2 x i8> %arg) {
  ret <2 x i1> %res
}

-define void @cmp.simplify(i32 %a, i32 %b, i1* %c) {
-  %lz = tail call i32 @llvm.ctlz.i32(i32 %a, i1 false) nounwind readnone
-  %lz.cmp = icmp eq i32 %lz, 32
-  store volatile i1 %lz.cmp, i1* %c
-  %tz = tail call i32 @llvm.cttz.i32(i32 %a, i1 false) nounwind readnone
-  %tz.cmp = icmp ne i32 %tz, 32
-  store volatile i1 %tz.cmp, i1* %c
-  %pop0 = tail call i32 @llvm.ctpop.i32(i32 %b) nounwind readnone
-  %pop0.cmp = icmp eq i32 %pop0, 0
-  store volatile i1 %pop0.cmp, i1* %c
-  %pop1 = tail call i32 @llvm.ctpop.i32(i32 %b) nounwind readnone
-  %pop1.cmp = icmp eq i32 %pop1, 32
-  store volatile i1 %pop1.cmp, i1* %c
-  ret void
-; CHECK: @cmp.simplify
-; CHECK-NEXT: %lz.cmp = icmp eq i32 %a, 0
-; CHECK-NEXT: store volatile i1 %lz.cmp, i1* %c
-; CHECK-NEXT: %tz.cmp = icmp ne i32 %a, 0
-; CHECK-NEXT: store volatile i1 %tz.cmp, i1* %c
-; CHECK-NEXT: %pop0.cmp = icmp eq i32 %b, 0
-; CHECK-NEXT: store volatile i1 %pop0.cmp, i1* %c
-; CHECK-NEXT: %pop1.cmp = icmp eq i32 %b, -1
-; CHECK-NEXT: store volatile i1 %pop1.cmp, i1* %c
-}
-
-define <2 x i1> @ctlz_cmp_vec(<2 x i32> %a) {
-; CHECK-LABEL: @ctlz_cmp_vec(
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
-; CHECK-NEXT: ret <2 x i1> [[CMP]]
-;
-  %x = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) nounwind readnone
-  %cmp = icmp eq <2 x i32> %x, <i32 32, i32 32>
-  ret <2 x i1> %cmp
-}
-
-define <2 x i1> @cttz_cmp_vec(<2 x i32> %a) {
-; CHECK-LABEL: @cttz_cmp_vec(
-; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> %a, zeroinitializer
-; CHECK-NEXT: ret <2 x i1> [[CMP]]
-;
-  %x = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) nounwind readnone
-  %cmp = icmp ne <2 x i32> %x, <i32 32, i32 32>
-  ret <2 x i1> %cmp
-}
-
-define void @ctpop_cmp_vec(<2 x i32> %a, <2 x i1>* %b) {
-  %pop0 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) nounwind readnone
-  %pop0.cmp = icmp eq <2 x i32> %pop0, zeroinitializer
-  store volatile <2 x i1> %pop0.cmp, <2 x i1>* %b
-  %pop1 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) nounwind readnone
-  %pop1.cmp = icmp eq <2 x i32> %pop1, < i32 32, i32 32 >
-  store volatile <2 x i1> %pop1.cmp, <2 x i1>* %b
-  ret void
-; CHECK-LABEL: @ctpop_cmp_vec(
-; CHECK-NEXT: %pop0.cmp = icmp eq <2 x i32> %a, zeroinitializer
-; CHECK-NEXT: store volatile <2 x i1> %pop0.cmp, <2 x i1>* %b
-; CHECK-NEXT: %pop1.cmp = icmp eq <2 x i32> %a, <i32 -1, i32 -1>
-; CHECK-NEXT: store volatile <2 x i1> %pop1.cmp, <2 x i1>* %b
-}
-
define i32 @ctlz_undef(i32 %Value) {
; CHECK-LABEL: @ctlz_undef(
; CHECK-NEXT: ret i32 undef
diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll
index 2164f0df8d279..947971c6c83b0 100644
--- a/test/Transforms/InstCombine/or-xor.ll
+++ b/test/Transforms/InstCombine/or-xor.ll
@@ -348,10 +348,8 @@ define i8 @test18(i8 %A, i8 %B) {
; ((x | y) ^ (~x | ~y)) -> ~(x ^ y)
define i32 @test19(i32 %x, i32 %y) {
; CHECK-LABEL: @test19(
-; CHECK-NEXT: [[OR1:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OR2_DEMORGAN:%.*]] = and i32 [[X]], [[Y]]
-; CHECK-NEXT: [[OR2:%.*]] = xor i32 [[OR2_DEMORGAN]], -1
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR1]], [[OR2]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
  %noty = xor i32 %y, -1
@@ -365,10 +363,8 @@ define i32 @test19(i32 %x, i32 %y) {
; ((x | y) ^ (~y | ~x)) -> ~(x ^ y)
define i32 @test20(i32 %x, i32 %y) {
; CHECK-LABEL: @test20(
-; CHECK-NEXT: [[OR1:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OR2_DEMORGAN:%.*]] = and i32 [[Y]], [[X]]
-; CHECK-NEXT: [[OR2:%.*]] = xor i32 [[OR2_DEMORGAN]], -1
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR1]], [[OR2]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
  %noty = xor i32 %y, -1
@@ -382,10 +378,8 @@ define i32 @test20(i32 %x, i32 %y) {
; ((~x | ~y) ^ (x | y)) -> ~(x ^ y)
define i32 @test21(i32 %x, i32 %y) {
; CHECK-LABEL: @test21(
-; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OR1:%.*]] = xor i32 [[OR1_DEMORGAN]], -1
-; CHECK-NEXT: [[OR2:%.*]] = or i32 [[X]], [[Y]]
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR2]], [[OR1]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
  %noty = xor i32 %y, -1
@@ -399,10 +393,8 @@ define i32 @test21(i32 %x, i32 %y) {
; ((~x | ~y) ^ (y | x)) -> ~(x ^ y)
define i32 @test22(i32 %x, i32 %y) {
; CHECK-LABEL: @test22(
-; CHECK-NEXT: [[OR1_DEMORGAN:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[OR1:%.*]] = xor i32 [[OR1_DEMORGAN]], -1
-; CHECK-NEXT: [[OR2:%.*]] = or i32 [[Y]], [[X]]
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[OR2]], [[OR1]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], -1
; CHECK-NEXT: ret i32 [[XOR]]
;
  %noty = xor i32 %y, -1
diff --git a/test/Transforms/InstCombine/pr33689_same_bitwidth.ll b/test/Transforms/InstCombine/pr33689_same_bitwidth.ll
new file mode 100644
index 0000000000000..e5dd019b9b519
--- /dev/null
+++ b/test/Transforms/InstCombine/pr33689_same_bitwidth.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine %s -o - | FileCheck %s
+
+; All the "useless" instructions should be removed and we shouldn't crash.
+
+target datalayout = "p:16:16"
+
+%i64_t = type i64
+
+@a = external global i16
+@b = external global i16*
+
+define void @f() {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: bb0:
+; CHECK-NEXT: [[TMP12:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT: [[TMP12_SUB:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[TMP12]], i16 0, i16 0
+; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint [2 x i32]* [[TMP12]] to i16
+; CHECK-NEXT: store i16 [[TMP8]], i16* @a, align 2
+; CHECK-NEXT: unreachable
+; CHECK: bb2:
+; CHECK-NEXT: [[TMP9:%.*]] = load i16*, i16** @b, align 2
+; CHECK-NEXT: store i16 0, i16* [[TMP9]], align 2
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP12_SUB]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -1
+; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP12_SUB]], align 8
+; CHECK-NEXT: ret void
+;
+bb0:
+  %tmp1 = alloca %i64_t
+  %tmp2 = bitcast %i64_t* %tmp1 to i32*
+  %useless3 = bitcast %i64_t* %tmp1 to i16*
+  %useless4 = getelementptr inbounds i16, i16* %useless3, i16 undef
+  %useless5 = bitcast i16* %useless4 to i32*
+  br i1 undef, label %bb1, label %bb2
+
+bb1: ; preds = %bb0
+  %useless6 = insertvalue [1 x i32*] undef, i32* %tmp2, 0
+  %useless7 = insertvalue [1 x i32*] %useless6, i32* null, 0
+  %tmp8 = ptrtoint i32* %tmp2 to i16
+  store i16 %tmp8, i16* @a
+  unreachable
+
+bb2: ; preds = %bb0
+  %tmp9 = load i16*, i16** @b
+  store i16 0, i16* %tmp9
+  %tmp10 = load i32, i32* %tmp2
+  %tmp11 = sub i32 %tmp10, 1
+  store i32 %tmp11, i32* %tmp2
+  ret void
+}
diff --git a/test/Transforms/InstCombine/select-implied.ll b/test/Transforms/InstCombine/select-implied.ll
index 2100e3eae0089..2558745c18f3c 100644
--- a/test/Transforms/InstCombine/select-implied.ll
+++ b/test/Transforms/InstCombine/select-implied.ll
@@ -121,3 +121,80 @@ end:

declare void @foo(i32)
declare i32 @bar(i32)
+
+; CHECK-LABEL: @test_and
+; CHECK: tpath:
+; CHECK-NOT: select
+; CHECK: ret i32 313
+define i32 @test_and(i32 %a, i32 %b) {
+entry:
+  %cmp1 = icmp ne i32 %a, 0
+  %cmp2 = icmp ne i32 %b, 0
+  %and = and i1 %cmp1, %cmp2
+  br i1 %and, label %tpath, label %end
+
+tpath:
+  %cmp3 = icmp eq i32 %a, 0 ;; <-- implied false
+  %c = select i1 %cmp3, i32 0, i32 313
+  ret i32 %c
+
+end:
+  ret i32 0
+}
+
+; cmp1 and cmp2 are false on the 'fpath' path and thus cmp3 is true.
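+; (Both operands are known non-zero on %fpath; in particular %a != 0, so the
+; 'icmp ne' below folds to true and the select returns 37.)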
+; CHECK-LABEL: @test_or1
+; CHECK: fpath:
+; CHECK-NOT: select
+; CHECK: ret i32 37
+define i32 @test_or1(i32 %a, i32 %b) {
+entry:
+  %cmp1 = icmp eq i32 %a, 0
+  %cmp2 = icmp eq i32 %b, 0
+  %or = or i1 %cmp1, %cmp2
+  br i1 %or, label %end, label %fpath
+
+fpath:
+  %cmp3 = icmp ne i32 %a, 0 ;; <-- implied true
+  %c = select i1 %cmp3, i32 37, i32 0
+  ret i32 %c
+
+end:
+  ret i32 0
+}
+
+; LHS ==> RHS by definition (true -> true)
+; CHECK-LABEL: @test6
+; CHECK: taken:
+; CHECK-NOT: select
+; CHECK: call void @foo(i32 10)
+define void @test6(i32 %a, i32 %b) {
+  %cmp1 = icmp eq i32 %a, %b
+  br i1 %cmp1, label %taken, label %end
+
+taken:
+  %c = select i1 %cmp1, i32 10, i32 0
+  call void @foo(i32 %c)
+  br label %end
+
+end:
+  ret void
+}
+
+; LHS ==> RHS by definition (false -> false)
+; CHECK-LABEL: @test7
+; CHECK: taken:
+; CHECK-NOT: select
+; CHECK: call void @foo(i32 11)
+define void @test7(i32 %a, i32 %b) {
+  %cmp1 = icmp eq i32 %a, %b
+  br i1 %cmp1, label %end, label %taken
+
+taken:
+  %c = select i1 %cmp1, i32 0, i32 11
+  call void @foo(i32 %c)
+  br label %end
+
+end:
+  ret void
+}
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index c8f2a50b72eda..acfa053daaf8d 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -1370,3 +1370,10 @@ define i8 @assume_cond_false(i1 %cond, i8 %x, i8 %y) {
  ret i8 %sel
}
+
+; Test case to make sure we don't consider an all ones float values for converting the select into a sext.
+define <4 x float> @PR33721(<4 x float> %w) {
+entry:
+  %0 = fcmp ole <4 x float> %w, zeroinitializer
+  %1 = select <4 x i1> %0, <4 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>, <4 x float> zeroinitializer
+  ret <4 x float> %1
+}
diff --git a/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll b/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
index 5938f9d7321d6..715c9413a8196 100644
--- a/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
+++ b/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
@@ -854,3 +854,32 @@ define void @load_factor2_fp128(<4 x fp128>* %ptr) {
  %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
  ret void
}
+
+define void @load_factor2_wide_pointer(<16 x i32*>* %ptr) {
+; NEON-LABEL: @load_factor2_wide_pointer(
+; NEON-NEXT: [[TMP1:%.*]] = bitcast <16 x i32*>* %ptr to i32*
+; NEON-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; NEON-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; NEON-NEXT: [[TMP4:%.*]] = inttoptr <4 x i32> [[TMP3]] to <4 x i32*>
+; NEON-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; NEON-NEXT: [[TMP6:%.*]] = inttoptr <4 x i32> [[TMP5]] to <4 x i32*>
+; NEON-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; NEON-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; NEON-NEXT: [[TMP10:%.*]] = inttoptr <4 x i32> [[TMP9]] to <4 x i32*>
+; NEON-NEXT: [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; NEON-NEXT: [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x i32*>
+; NEON-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32*> [[TMP4]], <4 x i32*> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT: ret void
+; NO_NEON-LABEL: @load_factor2_wide_pointer(
+; NO_NEON-NOT: @llvm.arm.neon
+; NO_NEON: ret void
+;
+  %interleaved.vec = load <16 x i32*>, <16 x i32*>* %ptr, align 4
+  %v0 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
diff --git a/test/Transforms/LoopRotate/pr33701.ll b/test/Transforms/LoopRotate/pr33701.ll
new file mode 100644
index 0000000000000..ed162b1209828
--- /dev/null
+++ b/test/Transforms/LoopRotate/pr33701.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output
+
+define void @func() {
+bb0:
+  br label %bb1
+
+bb1: ; preds = %bb4, %bb0
+  %0 = phi i16 [ %2, %bb4 ], [ 0, %bb0 ]
+  %1 = icmp sle i16 %0, 2
+  br i1 %1, label %bb2, label %bb5
+
+bb2: ; preds = %bb1
+  br i1 undef, label %bb6, label %bb4
+
+bb3: ; No predecessors!
+  br label %bb6
+
+bb4: ; preds = %bb2
+  %2 = add i16 undef, 1
+  br label %bb1
+
+bb5: ; preds = %bb1
+  br label %bb6
+
+bb6: ; preds = %bb5, %bb3, %bb2
+  unreachable
+}
diff --git a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
index dcd068191e105..ea3f607723197 100644
--- a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
+++ b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
@@ -14,8 +14,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; current LSR cost model.
; CHECK-NOT: = ptrtoint i8* undef to i64
; CHECK: .lr.ph
-; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1
-; CHECK: sub i64 [[TMP]], %tmp6
+; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp{{[0-9]+}}, -1
+; CHECK: sub i64 [[TMP]], %tmp{{[0-9]+}}
; CHECK: ret void
define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 {
bb:
diff --git a/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
new file mode 100644
index 0000000000000..4ce6f1a79fbfb
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/lsr-filtering-scaledreg.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -loop-reduce -lsr-filter-same-scaled-reg=true -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ham = type { i8, i8, [5 x i32], i64, i64, i64 }
+
+@global = external local_unnamed_addr global %struct.ham, align 8
+
+define void @foo() local_unnamed_addr {
+bb:
+  %tmp = load i64, i64* getelementptr inbounds (%struct.ham, %struct.ham* @global, i64 0, i32 3), align 8
+  %tmp1 = and i64 %tmp, 1792
+  %tmp2 = load i64, i64* getelementptr inbounds (%struct.ham, %struct.ham* @global, i64 0, i32 4), align 8
+  %tmp3 = add i64 %tmp1, %tmp2
+  %tmp4 = load i8*, i8** null, align 8
+  %tmp5 = getelementptr inbounds i8, i8* %tmp4, i64 0
+  %tmp6 = sub i64 0, %tmp3
+  %tmp7 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp6
+  %tmp8 = inttoptr i64 0 to i8*
+  br label %bb9
+
+; Without filtering non-optimal formulae with the same ScaledReg and Scale, the strategy
+; to narrow LSR search space by picking winner reg will generate only one lsr.iv and
+; unoptimal result.
+; CHECK-LABEL: @foo(
+; CHECK: bb9:
+; CHECK-NEXT: = phi i8*
+; CHECK-NEXT: = phi i8*
+
+bb9: ; preds = %bb12, %bb
+  %tmp10 = phi i8* [ %tmp7, %bb ], [ %tmp16, %bb12 ]
+  %tmp11 = phi i8* [ %tmp8, %bb ], [ %tmp17, %bb12 ]
+  br i1 false, label %bb18, label %bb12
+
+bb12: ; preds = %bb9
+  %tmp13 = getelementptr inbounds i8, i8* %tmp10, i64 8
+  %tmp14 = bitcast i8* %tmp13 to i64*
+  %tmp15 = load i64, i64* %tmp14, align 1
+  %tmp16 = getelementptr inbounds i8, i8* %tmp10, i64 16
+  %tmp17 = getelementptr inbounds i8, i8* %tmp11, i64 16
+  br label %bb9
+
+bb18: ; preds = %bb9
+  %tmp19 = icmp ugt i8* %tmp11, null
+  %tmp20 = getelementptr inbounds i8, i8* %tmp10, i64 8
+  %tmp21 = getelementptr inbounds i8, i8* %tmp11, i64 8
+  %tmp22 = select i1 %tmp19, i8* %tmp10, i8* %tmp20
+  %tmp23 = select i1 %tmp19, i8* %tmp11, i8* %tmp21
+  br label %bb24
+
+bb24: ; preds = %bb24, %bb18
+  %tmp25 = phi i8* [ %tmp27, %bb24 ], [ %tmp22, %bb18 ]
+  %tmp26 = phi i8* [ %tmp29, %bb24 ], [ %tmp23, %bb18 ]
+  %tmp27 = getelementptr inbounds i8, i8* %tmp25, i64 1
+  %tmp28 = load i8, i8* %tmp25, align 1
+  %tmp29 = getelementptr inbounds i8, i8* %tmp26, i64 1
+  store i8 %tmp28, i8* %tmp26, align 1
+  %tmp30 = icmp eq i8* %tmp29, %tmp5
+  br label %bb24
+}
diff --git a/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll b/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
index 1f31a133e34d9..73672e14f78ac 100644
--- a/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
@@ -1,29 +1,52 @@
-; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine -S| FileCheck %s
+; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info
-verify-loop-info -S | FileCheck %s -check-prefix=EPILOG-NO-IC +; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine -S | FileCheck %s -check-prefix=EPILOG ; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-count=2 -unroll-runtime-epilog=true -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine +; RUN: opt < %s -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine -S | FileCheck %s -check-prefix=PROLOG +; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-runtime-epilog=false -unroll-count=2 -unroll-runtime-multi-exit=true -verify-dom-info -verify-loop-info -instcombine -; the second RUN generates an epilog remainder block for all the test +; the third and fifth RUNs generate an epilog/prolog remainder block for all the test ; cases below (it does not generate a loop). ; test with three exiting and three exit blocks. ; none of the exit blocks have successors define void @test1(i64 %trip, i1 %cond) { -; CHECK-LABEL: test1 -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1 -; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7 -; CHECK-NEXT: br i1 [[TMP1]], label %exit2.loopexit.unr-lcssa, label [[ENTRY_NEW:%.*]] -; CHECK: entry.new: -; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[TRIP]], [[XTRAITER]] -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK-LABEL: loop_latch.epil: -; CHECK-NEXT: %epil.iter.sub = add i64 %epil.iter, -1 -; CHECK-NEXT: %epil.iter.cmp = icmp eq i64 %epil.iter.sub, 0 -; CHECK-NEXT: br i1 %epil.iter.cmp, label %exit2.loopexit.epilog-lcssa, label %loop_header.epil -; CHECK-LABEL: loop_latch.7: -; CHECK-NEXT: %niter.nsub.7 = add i64 %niter, -8 -; CHECK-NEXT: %niter.ncmp.7 = icmp eq i64 %niter.nsub.7, 0 -; CHECK-NEXT: br i1 %niter.ncmp.7, label %exit2.loopexit.unr-lcssa.loopexit, label %loop_header +; EPILOG: test1( +; EPILOG-NEXT: entry: +; EPILOG-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1 +; EPILOG-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7 +; EPILOG-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7 +; EPILOG-NEXT: br i1 [[TMP1]], label %exit2.loopexit.unr-lcssa, label [[ENTRY_NEW:%.*]] +; EPILOG: entry.new: +; EPILOG-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[TRIP]], [[XTRAITER]] +; EPILOG-NEXT: br label [[LOOP_HEADER:%.*]] +; EPILOG: loop_latch.epil: +; EPILOG-NEXT: %epil.iter.sub = add i64 %epil.iter, -1 +; EPILOG-NEXT: %epil.iter.cmp = icmp eq i64 %epil.iter.sub, 0 +; EPILOG-NEXT: br i1 %epil.iter.cmp, label %exit2.loopexit.epilog-lcssa, label %loop_header.epil +; EPILOG: loop_latch.7: +; EPILOG-NEXT: %niter.nsub.7 = add i64 %niter, -8 +; EPILOG-NEXT: %niter.ncmp.7 = icmp eq i64 %niter.nsub.7, 0 +; EPILOG-NEXT: br i1 %niter.ncmp.7, label %exit2.loopexit.unr-lcssa.loopexit, label %loop_header + +; PROLOG: test1( +; PROLOG-NEXT: entry: +; PROLOG-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1 +; PROLOG-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7 +; PROLOG-NEXT: [[TMP1:%.*]] = icmp eq i64 [[XTRAITER]], 0 +; PROLOG-NEXT: br i1 [[TMP1]], label %loop_header.prol.loopexit, label %loop_header.prol.preheader +; PROLOG: loop_header.prol: +; PROLOG-NEXT: %iv.prol = phi i64 [ 0, %loop_header.prol.preheader ], [ %iv_next.prol, %loop_latch.prol ] +; PROLOG-NEXT: %prol.iter = phi i64 [ [[XTRAITER]], %loop_header.prol.preheader ], [ %prol.iter.sub, %loop_latch.prol ] +; PROLOG-NEXT: br i1 
%cond, label %loop_latch.prol, label %loop_exiting_bb1.prol +; PROLOG: loop_latch.prol: +; PROLOG-NEXT: %iv_next.prol = add i64 %iv.prol, 1 +; PROLOG-NEXT: %prol.iter.sub = add i64 %prol.iter, -1 +; PROLOG-NEXT: %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 +; PROLOG-NEXT: br i1 %prol.iter.cmp, label %loop_header.prol.loopexit.unr-lcssa, label %loop_header.prol +; PROLOG: loop_latch.7: +; PROLOG-NEXT: %iv_next.7 = add i64 %iv, 8 +; PROLOG-NEXT: %cmp.7 = icmp eq i64 %iv_next.7, %trip +; PROLOG-NEXT: br i1 %cmp.7, label %exit2.loopexit.unr-lcssa, label %loop_header entry: br label %loop_header @@ -59,17 +82,30 @@ exit2.loopexit: ; %sum.02 and %add. Both of these are incoming values for phi from every exiting ; unrolled block. define i32 @test2(i32* nocapture %a, i64 %n) { -; CHECK-LABEL: test2 -; CHECK-LABEL: for.exit2.loopexit: -; CHECK-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %for.body ], [ 42, %for.exiting_block.1 ], [ %add.1, %for.body.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %for.body.2 ], [ 42, %for.exiting_block.3 ], -; CHECK-NEXT: br label %for.exit2 -; CHECK-LABEL: for.exit2.loopexit2: -; CHECK-NEXT: %retval.ph3 = phi i32 [ 42, %for.exiting_block.epil ], [ %sum.02.epil, %header.epil ] -; CHECK-NEXT: br label %for.exit2 -; CHECK-LABEL: for.exit2: -; CHECK-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph3, %for.exit2.loopexit2 ] -; CHECK-NEXT: ret i32 %retval -; CHECK: %niter.nsub.7 = add i64 %niter, -8 +; EPILOG: test2( +; EPILOG: for.exit2.loopexit: +; EPILOG-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %for.body ], [ 42, %for.exiting_block.1 ], [ %add.1, %for.body.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %for.body.2 ], [ 42, %for.exiting_block.3 ], +; EPILOG-NEXT: br label %for.exit2 +; EPILOG: for.exit2.loopexit2: +; EPILOG-NEXT: %retval.ph3 = phi i32 [ 42, %for.exiting_block.epil ], [ %sum.02.epil, %header.epil ] +; EPILOG-NEXT: br label %for.exit2 +; EPILOG: for.exit2: +; EPILOG-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph3, %for.exit2.loopexit2 ] +; EPILOG-NEXT: ret i32 %retval +; EPILOG: %niter.nsub.7 = add i64 %niter, -8 + +; PROLOG: test2( +; PROLOG: for.exit2.loopexit: +; PROLOG-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %for.body ], [ 42, %for.exiting_block.1 ], [ %add.1, %for.body.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %for.body.2 ], [ 42, %for.exiting_block.3 ], +; PROLOG-NEXT: br label %for.exit2 +; PROLOG: for.exit2.loopexit1: +; PROLOG-NEXT: %retval.ph2 = phi i32 [ 42, %for.exiting_block.prol ], [ %sum.02.prol, %header.prol ] +; PROLOG-NEXT: br label %for.exit2 +; PROLOG: for.exit2: +; PROLOG-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph2, %for.exit2.loopexit1 ] +; PROLOG-NEXT: ret i32 %retval +; PROLOG: %indvars.iv.next.7 = add i64 %indvars.iv, 8 + entry: br label %header @@ -102,25 +138,42 @@ for.exit2: ; test with two exiting and three exit blocks. ; the non-latch exiting block has a switch. 
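; Throughout this file, the EPILOG prefixes check the remainder loop placed
; after the 8x-unrolled body (it runs the trailing trip%8 iterations and is
; entered directly when 'trip-1 ult 7'), while the PROLOG prefixes check the
; remainder placed before it (it runs the leading trip%8 iterations and is
; skipped when %xtraiter is 0).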
define void @test3(i64 %trip, i64 %add) { -; CHECK-LABEL: test3 -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1 -; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7 -; CHECK-NEXT: br i1 [[TMP1]], label %exit2.loopexit.unr-lcssa, label [[ENTRY_NEW:%.*]] -; CHECK: entry.new: -; CHECK-NEXT: %unroll_iter = sub i64 [[TRIP]], [[XTRAITER]] -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK-LABEL: loop_header: -; CHECK-NEXT: %sum = phi i64 [ 0, %entry.new ], [ %sum.next.7, %loop_latch.7 ] -; CHECK-NEXT: %niter = phi i64 [ %unroll_iter, %entry.new ], [ %niter.nsub.7, %loop_latch.7 ] -; CHECK-LABEL: loop_exiting_bb1.7: -; CHECK-NEXT: switch i64 %sum.next.6, label %loop_latch.7 -; CHECK-LABEL: loop_latch.7: -; CHECK-NEXT: %sum.next.7 = add i64 %sum.next.6, %add -; CHECK-NEXT: %niter.nsub.7 = add i64 %niter, -8 -; CHECK-NEXT: %niter.ncmp.7 = icmp eq i64 %niter.nsub.7, 0 -; CHECK-NEXT: br i1 %niter.ncmp.7, label %exit2.loopexit.unr-lcssa.loopexit, label %loop_header +; EPILOG: test3( +; EPILOG-NEXT: entry: +; EPILOG-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1 +; EPILOG-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7 +; EPILOG-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7 +; EPILOG-NEXT: br i1 [[TMP1]], label %exit2.loopexit.unr-lcssa, label [[ENTRY_NEW:%.*]] +; EPILOG: entry.new: +; EPILOG-NEXT: %unroll_iter = sub i64 [[TRIP]], [[XTRAITER]] +; EPILOG-NEXT: br label [[LOOP_HEADER:%.*]] +; EPILOG: loop_header: +; EPILOG-NEXT: %sum = phi i64 [ 0, %entry.new ], [ %sum.next.7, %loop_latch.7 ] +; EPILOG-NEXT: %niter = phi i64 [ %unroll_iter, %entry.new ], [ %niter.nsub.7, %loop_latch.7 ] +; EPILOG: loop_exiting_bb1.7: +; EPILOG-NEXT: switch i64 %sum.next.6, label %loop_latch.7 +; EPILOG: loop_latch.7: +; EPILOG-NEXT: %sum.next.7 = add i64 %sum.next.6, %add +; EPILOG-NEXT: %niter.nsub.7 = add i64 %niter, -8 +; EPILOG-NEXT: %niter.ncmp.7 = icmp eq i64 %niter.nsub.7, 0 +; EPILOG-NEXT: br i1 %niter.ncmp.7, label %exit2.loopexit.unr-lcssa.loopexit, label %loop_header + +; PROLOG: test3( +; PROLOG-NEXT: entry: +; PROLOG-NEXT: [[TMP0:%.*]] = add i64 [[TRIP:%.*]], -1 +; PROLOG-NEXT: [[XTRAITER:%.*]] = and i64 [[TRIP]], 7 +; PROLOG-NEXT: [[TMP1:%.*]] = icmp eq i64 [[XTRAITER]], 0 +; PROLOG-NEXT: br i1 [[TMP1]], label %loop_header.prol.loopexit, label %loop_header.prol.preheader +; PROLOG: loop_header: +; PROLOG-NEXT: %iv = phi i64 [ %iv.unr, %entry.new ], [ %iv_next.7, %loop_latch.7 ] +; PROLOG-NEXT: %sum = phi i64 [ %sum.unr, %entry.new ], [ %sum.next.7, %loop_latch.7 ] +; PROLOG: loop_exiting_bb1.7: +; PROLOG-NEXT: switch i64 %sum.next.6, label %loop_latch.7 +; PROLOG: loop_latch.7: +; PROLOG-NEXT: %iv_next.7 = add nsw i64 %iv, 8 +; PROLOG-NEXT: %sum.next.7 = add i64 %sum.next.6, %add +; PROLOG-NEXT: %cmp.7 = icmp eq i64 %iv_next.7, %trip +; PROLOG-NEXT: br i1 %cmp.7, label %exit2.loopexit.unr-lcssa, label %loop_header entry: br label %loop_header @@ -153,9 +206,13 @@ exit2.loopexit: ; FIXME: Support multiple exiting blocks to the same latch exit block. define i32 @test4(i32* nocapture %a, i64 %n, i1 %cond) { -; CHECK-LABEL: test4 -; CHECK-NOT: .unr -; CHECK-NOT: .epil +; EPILOG: test4( +; EPILOG-NOT: .unr +; EPILOG-NOT: .epil + +; PROLOG: test4( +; PROLOG-NOT: .unr +; PROLOG-NOT: .prol entry: br label %header @@ -184,21 +241,68 @@ for.exit2: ret i32 42 } +; FIXME: Support multiple exiting blocks to the unique exit block. 
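; In @unique_exit below, both the header and the latch branch to the single
; %LoopExit block, so the loop has one exit block but two exiting blocks; the
; remainder-loop construction does not handle that shape yet, which is what
; the -NOT checks on the '.unr' and '.epil'/'.prol' names verify.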
+define void @unique_exit(i32 %arg) { +; EPILOG: unique_exit( +; EPILOG-NOT: .unr +; EPILOG-NOT: .epil + +; PROLOG: unique_exit( +; PROLOG-NOT: .unr +; PROLOG-NOT: .prol +entry: + %tmp = icmp sgt i32 undef, %arg + br i1 %tmp, label %preheader, label %returnblock + +preheader: ; preds = %entry + br label %header + +LoopExit: ; preds = %header, %latch + %tmp2.ph = phi i32 [ %tmp4, %header ], [ -1, %latch ] + br label %returnblock + +returnblock: ; preds = %LoopExit, %entry + %tmp2 = phi i32 [ -1, %entry ], [ %tmp2.ph, %LoopExit ] + ret void + +header: ; preds = %preheader, %latch + %tmp4 = phi i32 [ %inc, %latch ], [ %arg, %preheader ] + %inc = add nsw i32 %tmp4, 1 + br i1 true, label %LoopExit, label %latch + +latch: ; preds = %header + %cmp = icmp slt i32 %inc, undef + br i1 %cmp, label %header, label %LoopExit +} + ; two exiting and two exit blocks. ; the non-latch exiting block has duplicate edges to the non-latch exit block. define i64 @test5(i64 %trip, i64 %add, i1 %cond) { -; CHECK-LABEL: test5 -; CHECK-LABEL: exit1.loopexit: -; CHECK-NEXT: %result.ph = phi i64 [ %ivy, %loop_exiting ], [ %ivy, %loop_exiting ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.2, %loop_exiting.2 ], -; CHECK-NEXT: br label %exit1 -; CHECK-LABEL: exit1.loopexit2: -; CHECK-NEXT: %ivy.epil = add i64 %iv.epil, %add -; CHECK-NEXT: br label %exit1 -; CHECK-LABEL: exit1: -; CHECK-NEXT: %result = phi i64 [ %result.ph, %exit1.loopexit ], [ %ivy.epil, %exit1.loopexit2 ] -; CHECK-NEXT: ret i64 %result -; CHECK-LABEL: loop_latch.7: -; CHECK: %niter.nsub.7 = add i64 %niter, -8 +; EPILOG: test5( +; EPILOG: exit1.loopexit: +; EPILOG-NEXT: %result.ph = phi i64 [ %ivy, %loop_exiting ], [ %ivy, %loop_exiting ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.2, %loop_exiting.2 ], +; EPILOG-NEXT: br label %exit1 +; EPILOG: exit1.loopexit2: +; EPILOG-NEXT: %ivy.epil = add i64 %iv.epil, %add +; EPILOG-NEXT: br label %exit1 +; EPILOG: exit1: +; EPILOG-NEXT: %result = phi i64 [ %result.ph, %exit1.loopexit ], [ %ivy.epil, %exit1.loopexit2 ] +; EPILOG-NEXT: ret i64 %result +; EPILOG: loop_latch.7: +; EPILOG: %niter.nsub.7 = add i64 %niter, -8 + +; PROLOG: test5( +; PROLOG: exit1.loopexit: +; PROLOG-NEXT: %result.ph = phi i64 [ %ivy, %loop_exiting ], [ %ivy, %loop_exiting ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.1, %loop_exiting.1 ], [ %ivy.2, %loop_exiting.2 ], +; PROLOG-NEXT: br label %exit1 +; PROLOG: exit1.loopexit1: +; PROLOG-NEXT: %ivy.prol = add i64 %iv.prol, %add +; PROLOG-NEXT: br label %exit1 +; PROLOG: exit1: +; PROLOG-NEXT: %result = phi i64 [ %result.ph, %exit1.loopexit ], [ %ivy.prol, %exit1.loopexit1 ] +; PROLOG-NEXT: ret i64 %result +; PROLOG: loop_latch.7: +; PROLOG: %iv_next.7 = add nsw i64 %iv, 8 entry: br label %loop_header @@ -230,18 +334,31 @@ latchexit: ; test when exit blocks have successors. 
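; Unlike the earlier tests, %for.exit2 here is not a dead end: it branches on
; %cond to %exit_true/%exit_false, so the merged %retval phi created during
; unrolling must remain available to both of those successors.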
define i32 @test6(i32* nocapture %a, i64 %n, i1 %cond, i32 %x) { -; CHECK-LABEL: test6 -; CHECK-LABEL: for.exit2.loopexit: -; CHECK-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %latch ], [ 42, %for.exiting_block.1 ], [ %add.1, %latch.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %latch.2 ], -; CHECK-NEXT: br label %for.exit2 -; CHECK-LABEL: for.exit2.loopexit2: -; CHECK-NEXT: %retval.ph3 = phi i32 [ 42, %for.exiting_block.epil ], [ %sum.02.epil, %header.epil ] -; CHECK-NEXT: br label %for.exit2 -; CHECK-LABEL: for.exit2: -; CHECK-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph3, %for.exit2.loopexit2 ] -; CHECK-NEXT: br i1 %cond, label %exit_true, label %exit_false -; CHECK-LABEL: latch.7: -; CHECK: %niter.nsub.7 = add i64 %niter, -8 +; EPILOG: test6( +; EPILOG: for.exit2.loopexit: +; EPILOG-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %latch ], [ 42, %for.exiting_block.1 ], [ %add.1, %latch.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %latch.2 ], +; EPILOG-NEXT: br label %for.exit2 +; EPILOG: for.exit2.loopexit2: +; EPILOG-NEXT: %retval.ph3 = phi i32 [ 42, %for.exiting_block.epil ], [ %sum.02.epil, %header.epil ] +; EPILOG-NEXT: br label %for.exit2 +; EPILOG: for.exit2: +; EPILOG-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph3, %for.exit2.loopexit2 ] +; EPILOG-NEXT: br i1 %cond, label %exit_true, label %exit_false +; EPILOG: latch.7: +; EPILOG: %niter.nsub.7 = add i64 %niter, -8 + +; PROLOG: test6( +; PROLOG: for.exit2.loopexit: +; PROLOG-NEXT: %retval.ph = phi i32 [ 42, %for.exiting_block ], [ %sum.02, %header ], [ %add, %latch ], [ 42, %for.exiting_block.1 ], [ %add.1, %latch.1 ], [ 42, %for.exiting_block.2 ], [ %add.2, %latch.2 ], +; PROLOG-NEXT: br label %for.exit2 +; PROLOG: for.exit2.loopexit1: +; PROLOG-NEXT: %retval.ph2 = phi i32 [ 42, %for.exiting_block.prol ], [ %sum.02.prol, %header.prol ] +; PROLOG-NEXT: br label %for.exit2 +; PROLOG: for.exit2: +; PROLOG-NEXT: %retval = phi i32 [ %retval.ph, %for.exit2.loopexit ], [ %retval.ph2, %for.exit2.loopexit1 ] +; PROLOG-NEXT: br i1 %cond, label %exit_true, label %exit_false +; PROLOG: latch.7: +; PROLOG: %indvars.iv.next.7 = add i64 %indvars.iv, 8 entry: br label %header @@ -277,3 +394,87 @@ exit_true: exit_false: ret i32 %addx } + +; test when value in exit block does not have VMap. 
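; "VMap" refers to the unroller's map from original loop values to their
; per-iteration clones. In @test7 the exit phi %sext3 takes %shft, which is
; defined before the loop and so has no cloned copies; the exit-phi rewriting
; must reuse the original value rather than look one up in the map.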
+define i32 @test7(i32 %arg, i32 %arg1, i32 %arg2) { +; EPILOG-NO-IC: test7( +; EPILOG-NO-IC: loopexit1.loopexit: +; EPILOG-NO-IC-NEXT: %sext3.ph = phi i32 [ %shft, %header ], [ %shft, %latch ], [ %shft, %latch.1 ], [ %shft, %latch.2 ], [ %shft, %latch.3 ], [ %shft, %latch.4 ], [ %shft, %latch.5 ], [ %shft, %latch.6 ] +; EPILOG-NO-IC-NEXT: br label %loopexit1 +; EPILOG-NO-IC: loopexit1.loopexit1: +; EPILOG-NO-IC-NEXT: %sext3.ph2 = phi i32 [ %shft, %header.epil ] +; EPILOG-NO-IC-NEXT: br label %loopexit1 +; EPILOG-NO-IC: loopexit1: +; EPILOG-NO-IC-NEXT: %sext3 = phi i32 [ %sext3.ph, %loopexit1.loopexit ], [ %sext3.ph2, %loopexit1.loopexit1 ] +bb: + %tmp = icmp slt i32 undef, 2 + %sext = sext i32 undef to i64 + %shft = ashr exact i32 %arg, 16 + br i1 %tmp, label %loopexit2, label %preheader + +preheader: ; preds = %bb2 + br label %header + +header: ; preds = %latch, %preheader + %tmp6 = phi i64 [ 1, %preheader ], [ %add, %latch ] + br i1 false, label %loopexit1, label %latch + +latch: ; preds = %header + %add = add nuw nsw i64 %tmp6, 1 + %tmp9 = icmp slt i64 %add, %sext + br i1 %tmp9, label %header, label %latchexit + +latchexit: ; preds = %latch + unreachable + +loopexit2: ; preds = %bb2 + ret i32 %shft + +loopexit1: ; preds = %header + %sext3 = phi i32 [ %shft, %header ] + ret i32 %sext3 +} + +; Nested loop and inner loop is unrolled +; FIXME: we cannot unroll with epilog remainder currently, because +; the outer loop does not contain the epilog preheader and epilog exit (while +; infact it should). This causes us to choke up on LCSSA form being incorrect in +; outer loop. However, the exit block where LCSSA fails, is infact still within +; the outer loop. For now, we just bail out in presence of outer loop and epilog +; loop is generated. +; The outer loop header is the preheader for the inner loop and the inner header +; branches back to the outer loop. +define void @test8() { +; EPILOG: test8( +; EPILOG-NOT: niter + +; PROLOG: test8( +; PROLOG: outerloop: +; PROLOG-NEXT: phi i64 [ 3, %bb ], [ 0, %outerloop.loopexit ] +; PROLOG: %lcmp.mod = icmp eq i64 +; PROLOG-NEXT: br i1 %lcmp.mod, label %innerH.prol.loopexit, label %innerH.prol.preheader +; PROLOG: latch.6: +; PROLOG-NEXT: %tmp4.7 = add nsw i64 %tmp3, 8 +; PROLOG-NEXT: br i1 false, label %outerloop.loopexit.loopexit, label %latch.7 +; PROLOG: latch.7 +; PROLOG-NEXT: %tmp6.7 = icmp ult i64 %tmp4.7, 100 +; PROLOG-NEXT: br i1 %tmp6.7, label %innerH, label %exit.unr-lcssa +bb: + br label %outerloop + +outerloop: ; preds = %innerH, %bb + %tmp = phi i64 [ 3, %bb ], [ 0, %innerH ] + br label %innerH + +innerH: ; preds = %latch, %outerloop + %tmp3 = phi i64 [ %tmp4, %latch ], [ %tmp, %outerloop ] + %tmp4 = add nuw nsw i64 %tmp3, 1 + br i1 false, label %outerloop, label %latch + +latch: ; preds = %innerH + %tmp6 = icmp ult i64 %tmp4, 100 + br i1 %tmp6, label %innerH, label %exit + +exit: ; preds = %latch + ret void +} diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll index 04661314eb1d7..878f4e8c78f0f 100644 --- a/test/Transforms/LoopUnroll/runtime-loop.ll +++ b/test/Transforms/LoopUnroll/runtime-loop.ll @@ -170,6 +170,74 @@ for.end: ; preds = %for.cond.for.end_cr ret i16 %res.0.lcssa } +; dont unroll loop with multiple exit/exiting blocks, unless +; -runtime-unroll-multi-exit=true +; single exit, multiple exiting blocks. 
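; (The option meant above is the -unroll-runtime-multi-exit flag used in the
; RUN lines of runtime-loop-multiple-exits.ll; the RUN lines of this file do
; not pass it, so neither of the shapes below may be unrolled.)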
+define void @unique_exit(i32 %arg) { +; PROLOG: unique_exit( +; PROLOG-NOT: .unr + +; EPILOG: unique_exit( +; EPILOG-NOT: .unr +entry: + %tmp = icmp sgt i32 undef, %arg + br i1 %tmp, label %preheader, label %returnblock + +preheader: ; preds = %entry + br label %header + +LoopExit: ; preds = %header, %latch + %tmp2.ph = phi i32 [ %tmp4, %header ], [ -1, %latch ] + br label %returnblock + +returnblock: ; preds = %LoopExit, %entry + %tmp2 = phi i32 [ -1, %entry ], [ %tmp2.ph, %LoopExit ] + ret void + +header: ; preds = %preheader, %latch + %tmp4 = phi i32 [ %inc, %latch ], [ %arg, %preheader ] + %inc = add nsw i32 %tmp4, 1 + br i1 true, label %LoopExit, label %latch + +latch: ; preds = %header + %cmp = icmp slt i32 %inc, undef + br i1 %cmp, label %header, label %LoopExit +} + +; multiple exit blocks. don't unroll +define void @multi_exit(i64 %trip, i1 %cond) { +; PROLOG: multi_exit( +; PROLOG-NOT: .unr + +; EPILOG: multi_exit( +; EPILOG-NOT: .unr +entry: + br label %loop_header + +loop_header: + %iv = phi i64 [ 0, %entry ], [ %iv_next, %loop_latch ] + br i1 %cond, label %loop_latch, label %loop_exiting_bb1 + +loop_exiting_bb1: + br i1 false, label %loop_exiting_bb2, label %exit1 + +loop_exiting_bb2: + br i1 false, label %loop_latch, label %exit3 + +exit3: + ret void + +loop_latch: + %iv_next = add i64 %iv, 1 + %cmp = icmp ne i64 %iv_next, %trip + br i1 %cmp, label %loop_header, label %exit2.loopexit + +exit1: + ret void + +exit2.loopexit: + ret void +} !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.unroll.runtime.disable"} diff --git a/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll b/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll new file mode 100644 index 0000000000000..cd3e89ae73504 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll @@ -0,0 +1,49 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s +; REQUIRES: asserts +; This test should not be vectorized in X86\SLM arch +; Vectorizing the 64bit multiply in this case is wrong since +; it can be done with a lower bit mode (notice that the sources is 16bit) +; Also addq\subq (quad word) has a high cost on SLM arch. +; this test has a bad performance (regression of -70%) if vectorized on SLM arch +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) { +entry: +; MSG: LV: Selecting VF: 1. 
+ %cmp17 = icmp sgt i32 %LastIndex, 0 + br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv5 = sext i16 %Scale to i64 + %sh_prom = and i64 %conv5, 4294967295 + %0 = sext i16 %lag to i64 + %wide.trip.count = zext i32 %LastIndex to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %conv8 = trunc i64 %add7 to i32 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ] + ret i32 %Accumulator.0.lcssa + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv + %1 = load i16, i16* %arrayidx, align 2 + %conv = sext i16 %1 to i64 + %2 = add nsw i64 %indvars.iv, %0 + %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2 + %3 = load i16, i16* %arrayidx3, align 2 + %conv4 = sext i16 %3 to i64 + %mul = mul nsw i64 %conv4, %conv + %shr = ashr i64 %mul, %sh_prom + %add7 = add i64 %shr, %Accumulator.018 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + diff --git a/test/Transforms/LoopVectorize/if-conversion-nest.ll b/test/Transforms/LoopVectorize/if-conversion-nest.ll index 3a581ebf847ec..7f381ae6ad7b5 100644 --- a/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -1,18 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -;CHECK-LABEL: @foo( -;CHECK: icmp sgt -;CHECK: icmp sgt -;CHECK: icmp slt -;CHECK: select <4 x i1> -;CHECK: %[[P1:.*]] = select <4 x i1> -;CHECK: xor <4 x i1> -;CHECK: and <4 x i1> -;CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %[[P1]] -;CHECK: ret define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[MIN_ITERS_CHECKED:%.*]] +; CHECK: min.iters.checked: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[N]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[CMP_ZERO:%.*]] = icmp eq i64 [[N_VEC]], 0 +; CHECK-NEXT: br i1 [[CMP_ZERO]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = 
getelementptr i32, i32* [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !3 +; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD6]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 19, i32 19, i32 19, i32 19> +; CHECK-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 5, i32 5, i32 5, i32 5> +; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> <i32 9, i32 9, i32 9, i32 9> +; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true> +; CHECK-NEXT: [[TMP17:%.*]] = and <4 x i1> [[TMP11]], [[TMP16]] +; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[TMP14]], <4 x i32> [[PREDPHI]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[PREDPHI7]], <4 x i32>* [[TMP18]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[MIN_ITERS_CHECKED]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]] +; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]] +; CHECK: if.then: +; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP20]], 19 +; CHECK-NEXT: br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]] +; CHECK: 
if.else: +; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP21]], 4 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP10]], i32 4, i32 5 +; CHECK-NEXT: br label [[IF_END14]] +; CHECK: if.end14: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 9, [[FOR_BODY]] ], [ 3, [[IF_THEN]] ], [ [[DOT]], [[IF_ELSE]] ] +; CHECK-NEXT: store i32 [[X_0]], i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !8 +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret i32 undef +; entry: %cmp26 = icmp sgt i32 %n, 0 br i1 %cmp26, label %for.body, label %for.end @@ -46,3 +120,4 @@ if.end14: for.end: ret i32 undef } + diff --git a/test/Transforms/LoopVectorize/pr33706.ll b/test/Transforms/LoopVectorize/pr33706.ll new file mode 100644 index 0000000000000..b9d0d8a44accb --- /dev/null +++ b/test/Transforms/LoopVectorize/pr33706.ll @@ -0,0 +1,61 @@ +; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck %s + +@global = local_unnamed_addr global i32 0, align 4 +@global.1 = local_unnamed_addr global i32 0, align 4 +@global.2 = local_unnamed_addr global float 0x3EF0000000000000, align 4 + +; CHECK-LABEL: @PR33706 +; CHECK-NOT: <2 x i32> +define void @PR33706(float* nocapture readonly %arg, float* nocapture %arg1, i32 %arg2) local_unnamed_addr { +bb: + %tmp = load i32, i32* @global.1, align 4 + %tmp3 = getelementptr inbounds float, float* %arg, i64 190 + %tmp4 = getelementptr inbounds float, float* %arg1, i64 512 + %tmp5 = and i32 %tmp, 65535 + %tmp6 = icmp ugt i32 %arg2, 65536 + br i1 %tmp6, label %bb7, label %bb9 + +bb7: ; preds = %bb + %tmp8 = load i32, i32* @global, align 4 + br label %bb27 + +bb9: ; preds = %bb + %tmp10 = udiv i32 65536, %arg2 + br label %bb11 + +bb11: ; preds = %bb11, %bb9 + %tmp12 = phi i32 [ %tmp20, %bb11 ], [ %tmp5, %bb9 ] + %tmp13 = phi float* [ %tmp18, %bb11 ], [ %tmp4, %bb9 ] + %tmp14 = phi i32 [ %tmp16, %bb11 ], [ %tmp10, %bb9 ] + %tmp15 = phi i32 [ %tmp19, %bb11 ], [ %tmp, %bb9 ] + %tmp16 = add nsw i32 %tmp14, -1 + %tmp17 = sitofp i32 %tmp12 to float + store float %tmp17, float* %tmp13, align 4 + %tmp18 = getelementptr inbounds float, float* %tmp13, i64 1 + %tmp19 = add i32 %tmp15, %arg2 + %tmp20 = and i32 %tmp19, 65535 + %tmp21 = icmp eq i32 %tmp16, 0 + br i1 %tmp21, label %bb22, label %bb11 + +bb22: ; preds = %bb11 + %tmp23 = phi float* [ %tmp18, %bb11 ] + %tmp24 = phi i32 [ %tmp19, %bb11 ] + %tmp25 = phi i32 [ %tmp20, %bb11 ] + %tmp26 = ashr i32 %tmp24, 16 + store i32 %tmp26, i32* @global, align 4 + br label %bb27 + +bb27: ; preds = %bb22, %bb7 + %tmp28 = phi i32 [ %tmp26, %bb22 ], [ %tmp8, %bb7 ] + %tmp29 = phi float* [ %tmp23, %bb22 ], [ %tmp4, %bb7 ] + %tmp30 = phi i32 [ %tmp25, %bb22 ], [ %tmp5, %bb7 ] + %tmp31 = sext i32 %tmp28 to i64 + %tmp32 = getelementptr inbounds float, float* %tmp3, i64 %tmp31 + %tmp33 = load float, float* %tmp32, align 4 + %tmp34 = sitofp i32 %tmp30 to float + %tmp35 = load float, float* @global.2, align 4 + %tmp36 = fmul float %tmp35, %tmp34 + %tmp37 = fadd float %tmp33, %tmp36 + store float %tmp37, float* %tmp29, align 4 + ret void +} diff --git a/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml b/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml index 17b634acd0e1a..558aa9aa73f25 100644 --- 
a/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml +++ b/test/Transforms/LowerTypeTests/Inputs/import-icall.yaml @@ -16,4 +16,5 @@ CfiFunctionDefs: CfiFunctionDecls: - external - external_weak + - local_decl ... diff --git a/test/Transforms/LowerTypeTests/import-icall.ll b/test/Transforms/LowerTypeTests/import-icall.ll index ddeb7fb5c9a2b..b4e374720321e 100644 --- a/test/Transforms/LowerTypeTests/import-icall.ll +++ b/test/Transforms/LowerTypeTests/import-icall.ll @@ -19,6 +19,10 @@ define i8 @use_b() { ret i8 %x } +define void @local_decl() { + call void @local_decl() + ret void +} declare void @external() declare extern_weak void @external_weak() @@ -33,6 +37,9 @@ declare extern_weak void @external_weak() ; CHECK: define internal i8 @local_b() { ; CHECK-NEXT: call i8 @local_a() +; CHECK: define void @local_decl() +; CHECK-NEXT: call void @local_decl() + ; CHECK: declare void @external() ; CHECK: declare extern_weak void @external_weak() ; CHECK: declare i8 @local_a() diff --git a/test/Transforms/NewGVN/pr33720.ll b/test/Transforms/NewGVN/pr33720.ll new file mode 100644 index 0000000000000..3b6c190a44944 --- /dev/null +++ b/test/Transforms/NewGVN/pr33720.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -newgvn -S %s | FileCheck %s + +@f = external local_unnamed_addr global i64 +@b = external local_unnamed_addr global i64 +@e = external local_unnamed_addr global i64 + +define void @patatino() { +; CHECK-LABEL: @patatino( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[IF_END24:%.*]], label [[FOR_COND16:%.*]] +; CHECK: for.cond2thread-pre-split: +; CHECK-NEXT: br i1 false, label [[FOR_BODY:%.*]], label [[FOR_COND8_PREHEADER:%.*]] +; CHECK: for.cond8.preheader: +; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label %for.cond11thread-pre-split.lr.ph +; CHECK: for.cond11thread-pre-split.lr.ph: +; CHECK-NEXT: br label [[L1]] +; CHECK: for.body: +; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[K_2:%.*]], 3 +; CHECK-NEXT: [[CONV4:%.*]] = zext i1 [[CMP3]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* @f +; CHECK-NEXT: [[OR:%.*]] = or i64 [[TMP0]], [[CONV4]] +; CHECK-NEXT: store i64 [[OR]], i64* @f +; CHECK-NEXT: [[TOBOOL7:%.*]] = icmp ne i64 [[K_2]], 0 +; CHECK-NEXT: br i1 [[TOBOOL7]], label %for.cond2thread-pre-split, label [[LOR_RHS:%.*]] +; CHECK: lor.rhs: +; CHECK-NEXT: store i64 1, i64* @b, align 8 +; CHECK-NEXT: br label %for.cond2thread-pre-split +; CHECK: l1: +; CHECK-NEXT: [[K_2]] = phi i64 [ undef, [[L1_PREHEADER:%.*]] ], [ 15, [[FOR_COND8_PREHEADER]] ], [ 5, %for.cond11thread-pre-split.lr.ph ] +; CHECK-NEXT: store i64 7, i64* [[J_3:%.*]] +; CHECK-NEXT: br label [[FOR_BODY]] +; CHECK: for.cond16: +; CHECK-NEXT: [[J_0:%.*]] = phi i64* [ @f, [[ENTRY:%.*]] ], [ undef, [[FOR_COND20:%.*]] ], [ @e, [[FOR_COND16]] ] +; CHECK-NEXT: br i1 undef, label [[FOR_COND20]], label [[FOR_COND16]] +; CHECK: for.cond20: +; CHECK-NEXT: [[J_2:%.*]] = phi i64* [ [[J_0]], [[FOR_COND16]] ], [ undef, [[IF_END24]] ] +; CHECK-NEXT: br i1 true, label [[IF_END24]], label [[FOR_COND16]] +; CHECK: if.end24: +; CHECK-NEXT: [[J_3]] = phi i64* [ [[J_2]], [[FOR_COND20]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: br i1 false, label [[FOR_COND20]], label [[L1_PREHEADER]] +; CHECK: l1.preheader: +; CHECK-NEXT: br label [[L1]] +; +entry: + br i1 undef, label %if.end24, label %for.cond16 + +for.cond2thread-pre-split: + br i1 false, label %for.body, label %for.cond8.preheader + +for.cond8.preheader: + br i1 undef, label %l1, label 
%for.cond11thread-pre-split.lr.ph + +for.cond11thread-pre-split.lr.ph: + br label %l1 + +for.body: + %k.031 = phi i64 [ %k.2, %l1 ], [ 15, %for.cond2thread-pre-split ] + %cmp3 = icmp ne i64 %k.031, 3 + %conv4 = zext i1 %cmp3 to i64 + %0 = load i64, i64* @f + %or = or i64 %0, %conv4 + store i64 %or, i64* @f + %tobool7 = icmp ne i64 %k.031, 0 + %or.cond = or i1 %tobool7, false + br i1 %or.cond, label %for.cond2thread-pre-split, label %lor.rhs + +lor.rhs: + store i64 1, i64* @b, align 8 + br label %for.cond2thread-pre-split + +l1: + %k.2 = phi i64 [ undef, %l1.preheader ], [ 15, %for.cond8.preheader ], [ 5, %for.cond11thread-pre-split.lr.ph ] + store i64 7, i64* %j.3 + br label %for.body + +for.cond16: + %j.0 = phi i64* [ @f, %entry ], [ %j.2, %for.cond20 ], [ @e, %for.cond16 ] + br i1 undef, label %for.cond20, label %for.cond16 + +for.cond20: + %j.2 = phi i64* [ %j.0, %for.cond16 ], [ %j.3, %if.end24 ] + br i1 true, label %if.end24, label %for.cond16 + +if.end24: + %j.3 = phi i64* [ %j.2, %for.cond20 ], [ undef, %entry ] + br i1 false, label %for.cond20, label %l1.preheader + +l1.preheader: + br label %l1 +} diff --git a/test/Transforms/PGOProfile/counter_promo_exit_merge.ll b/test/Transforms/PGOProfile/counter_promo_exit_merge.ll index f53d37600ce61..85ca1613c8ad3 100644 --- a/test/Transforms/PGOProfile/counter_promo_exit_merge.ll +++ b/test/Transforms/PGOProfile/counter_promo_exit_merge.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s -; RUN: opt < %s --passes=instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s +; RUN: opt < %s -instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s +; RUN: opt < %s --passes=instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s $__llvm_profile_raw_version = comdat any diff --git a/test/Transforms/PGOProfile/counter_promo_mexits.ll b/test/Transforms/PGOProfile/counter_promo_mexits.ll index 71e5f066d50f3..bb799757a47cc 100644 --- a/test/Transforms/PGOProfile/counter_promo_mexits.ll +++ b/test/Transforms/PGOProfile/counter_promo_mexits.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s -; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s +; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s +; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s @g = common local_unnamed_addr global i32 0, align 4 diff --git a/test/Transforms/PGOProfile/counter_promo_nest.ll b/test/Transforms/PGOProfile/counter_promo_nest.ll new file mode 100644 index 0000000000000..b7f117b3e9496 --- /dev/null +++ b/test/Transforms/PGOProfile/counter_promo_nest.ll @@ -0,0 +1,165 @@ +; TEST that counter updates are promoted outside the whole loop nest +; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO %s +; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO %s + +@g = common local_unnamed_addr global i32 
0, align 4 +@c = local_unnamed_addr global i32 10, align 4 + +; Function Attrs: noinline norecurse nounwind uwtable +define void @bar() local_unnamed_addr #0 { +bb: + %tmp2 = load i32, i32* @g, align 4, !tbaa !2 + %tmp3 = add nsw i32 %tmp2, 1 + store i32 %tmp3, i32* @g, align 4, !tbaa !2 + ret void +} + +; Function Attrs: norecurse nounwind uwtable +define i32 @main() local_unnamed_addr #1 { +bb: + store i32 0, i32* @g, align 4, !tbaa !2 + %tmp = load i32, i32* @c, align 4, !tbaa !2 + %tmp1 = icmp sgt i32 %tmp, 0 + br i1 %tmp1, label %bb2_1, label %bb84 + +bb2_1: + br label %bb2 + +bb2: ; preds = %bb39, %bb + %tmp3 = phi i32 [ %tmp40, %bb39 ], [ %tmp, %bb2_1 ] + %tmp5 = phi i32 [ %tmp43, %bb39 ], [ 0, %bb2_1 ] + %tmp7 = icmp sgt i32 %tmp3, 0 + br i1 %tmp7, label %bb14_1, label %bb39 + +bb8: ; preds = %bb39 +; PROMO-LABEL: bb8 +; PROMO: load {{.*}} @__profc_main{{.*}} +; PROMO-NEXT: add +; PROMO-NEXT: store {{.*}}@__profc_main{{.*}} +; PROMO-NEXT: load {{.*}} @__profc_main{{.*}} +; PROMO-NEXT: add +; PROMO-NEXT: store {{.*}}@__profc_main{{.*}} +; PROMO-NEXT: load {{.*}} @__profc_main{{.*}} +; PROMO-NEXT: add +; PROMO-NEXT: store {{.*}}@__profc_main{{.*}} +; PROMO-NEXT: load {{.*}} @__profc_main{{.*}} +; PROMO-NEXT: add +; PROMO-NEXT: store {{.*}}@__profc_main{{.*}} +; PROMO-NEXT: load {{.*}} @__profc_main{{.*}} +; PROMO-NEXT: add +; PROMO-NEXT: store {{.*}}@__profc_main{{.*}} + + %tmp13 = icmp sgt i32 %tmp40, 0 + br i1 %tmp13, label %bb45, label %bb84 + +bb14_1: + br label %bb14 + +bb14: ; preds = %bb29, %bb2 + %tmp15 = phi i32 [ %tmp30, %bb29 ], [ %tmp3, %bb14_1 ] + %tmp16 = phi i64 [ %tmp31, %bb29 ], [ 0, %bb14_1 ] + %tmp17 = phi i64 [ %tmp32, %bb29 ], [ 0, %bb14_1 ] + %tmp18 = phi i32 [ %tmp33, %bb29 ], [ 0, %bb14_1 ] + %tmp19 = icmp sgt i32 %tmp15, 0 + br i1 %tmp19, label %bb20_split, label %bb29 + +bb20_split: + br label %bb20 + +bb20: ; preds = %bb20, %bb14 + %tmp21 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb20_split ] + %tmp22 = phi i32 [ %tmp24, %bb20 ], [ 0, %bb20_split ] + %tmp23 = add nuw i64 %tmp21, 1 + tail call void @bar() + %tmp24 = add nuw nsw i32 %tmp22, 1 + %tmp25 = load i32, i32* @c, align 4, !tbaa !2 + %tmp26 = icmp slt i32 %tmp24, %tmp25 + br i1 %tmp26, label %bb20, label %bb27 + +bb27: ; preds = %bb20 + %tmp28 = add i64 %tmp23, %tmp16 + br label %bb29 + +bb29: ; preds = %bb27, %bb14 + %tmp30 = phi i32 [ %tmp25, %bb27 ], [ %tmp15, %bb14 ] + %tmp31 = phi i64 [ %tmp28, %bb27 ], [ %tmp16, %bb14 ] + %tmp32 = add nuw i64 %tmp17, 1 + %tmp33 = add nuw nsw i32 %tmp18, 1 + %tmp34 = icmp slt i32 %tmp33, %tmp30 + br i1 %tmp34, label %bb14, label %bb35 + +bb35: ; preds = %bb29 + %tmp36 = insertelement <2 x i64> undef, i64 %tmp31, i32 0 + br label %bb39 + +bb39: ; preds = %bb35, %bb2 + %tmp40 = phi i32 [ %tmp30, %bb35 ], [ %tmp3, %bb2 ] + %tmp43 = add nuw nsw i32 %tmp5, 1 + %tmp44 = icmp slt i32 %tmp43, %tmp40 + br i1 %tmp44, label %bb2, label %bb8 + +bb45: ; preds = %bb67, %bb8 + %tmp46 = phi i32 [ %tmp68, %bb67 ], [ %tmp40, %bb8 ] + %tmp47 = phi i64 [ %tmp69, %bb67 ], [ 0, %bb8 ] + %tmp48 = phi i64 [ %tmp70, %bb67 ], [ 0, %bb8 ] + %tmp49 = phi i32 [ %tmp71, %bb67 ], [ 0, %bb8 ] + %tmp50 = icmp sgt i32 %tmp46, 0 + br i1 %tmp50, label %bb57, label %bb67 + +bb51: ; preds = %bb67 + %tmp56 = icmp sgt i32 %tmp68, 0 + br i1 %tmp56, label %bb73, label %bb84 + +bb57: ; preds = %bb57, %bb45 + %tmp58 = phi i64 [ %tmp60, %bb57 ], [ 0, %bb45 ] + %tmp59 = phi i32 [ %tmp61, %bb57 ], [ 0, %bb45 ] + %tmp60 = add nuw i64 %tmp58, 1 + tail call void @bar() + %tmp61 = add nuw nsw i32 %tmp59, 1 + %tmp62 = 
load i32, i32* @c, align 4, !tbaa !2 + %tmp63 = mul nsw i32 %tmp62, 10 + %tmp64 = icmp slt i32 %tmp61, %tmp63 + br i1 %tmp64, label %bb57, label %bb65 + +bb65: ; preds = %bb57 + %tmp66 = add i64 %tmp60, %tmp47 + br label %bb67 + +bb67: ; preds = %bb65, %bb45 + %tmp68 = phi i32 [ %tmp62, %bb65 ], [ %tmp46, %bb45 ] + %tmp69 = phi i64 [ %tmp66, %bb65 ], [ %tmp47, %bb45 ] + %tmp70 = add nuw i64 %tmp48, 1 + %tmp71 = add nuw nsw i32 %tmp49, 1 + %tmp72 = icmp slt i32 %tmp71, %tmp68 + br i1 %tmp72, label %bb45, label %bb51 + +bb73: ; preds = %bb73, %bb51 + %tmp74 = phi i64 [ %tmp76, %bb73 ], [ 0, %bb51 ] + %tmp75 = phi i32 [ %tmp77, %bb73 ], [ 0, %bb51 ] + %tmp76 = add nuw i64 %tmp74, 1 + tail call void @bar() + %tmp77 = add nuw nsw i32 %tmp75, 1 + %tmp78 = load i32, i32* @c, align 4, !tbaa !2 + %tmp79 = mul nsw i32 %tmp78, 100 + %tmp80 = icmp slt i32 %tmp77, %tmp79 + br i1 %tmp80, label %bb73, label %bb81 + +bb81: ; preds = %bb73 + br label %bb84 + +bb84: ; preds = %bb81, %bb51, %bb8, %bb + ret i32 0 +} + +attributes #0 = { noinline } +attributes #1 = { norecurse nounwind uwtable } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 307355)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} diff --git a/test/Transforms/SimplifyCFG/implied-and-or.ll b/test/Transforms/SimplifyCFG/implied-and-or.ll new file mode 100644 index 0000000000000..e615f302feefd --- /dev/null +++ b/test/Transforms/SimplifyCFG/implied-and-or.ll @@ -0,0 +1,183 @@ +; RUN: opt %s -S -simplifycfg | FileCheck %s + +declare void @foo() +declare void @bar() + + +; CHECK-LABEL: @test_and1 +; CHECK: taken: +; CHECK-NOT: cmp3 +; CHECK: call void @bar() +; CHECK-NEXT: call void @foo() +; CHECK: ret +define void @test_and1(i32 %a, i32 %b) { +entry: + %cmp1 = icmp eq i32 %a, 0 + %cmp2 = icmp eq i32 %b, 0 + %and = and i1 %cmp1, %cmp2 + br i1 %and, label %taken, label %end + +taken: + call void @bar() + %cmp3 = icmp eq i32 %a, 0 ;; <-- implied true + br i1 %cmp3, label %if.then, label %end + +if.then: + call void @foo() + br label %end + +end: + ret void +} + +; We can't infer anything if the result of the 'and' is false +; CHECK-LABEL: @test_and2 +; CHECK: taken: +; CHECK: call void @bar() +; CHECK: %cmp3 +; CHECK: br i1 %cmp3 +; CHECK: if.then: +; CHECK: call void @foo() +; CHECK: ret +define void @test_and2(i32 %a, i32 %b) { +entry: + %cmp1 = icmp eq i32 %a, 0 + %cmp2 = icmp eq i32 %b, 0 + %and = and i1 %cmp1, %cmp2 + br i1 %and, label %end, label %taken + +taken: + call void @bar() + %cmp3 = icmp eq i32 %a, 0 + br i1 %cmp3, label %if.then, label %end + +if.then: + call void @foo() + br label %end + +end: + ret void +} + +; CHECK-LABEL: @test_or1 +; CHECK: taken: +; CHECK-NOT: cmp3 +; CHECK: call void @bar() +; CHECK-NEXT: call void @foo() +; CHECK: ret +define void @test_or1(i32 %a, i32 %b) { +entry: + %cmp1 = icmp eq i32 %a, 0 + %cmp2 = icmp eq i32 %b, 0 + %or = or i1 %cmp1, %cmp2 + br i1 %or, label %end, label %taken + +taken: + call void @bar() + %cmp3 = icmp ne i32 %a, 0 ;; <-- implied true + br i1 %cmp3, label %if.then, label %end + +if.then: + call void @foo() + br label %end + +end: + ret void +} + +; We can't infer anything if the result of the 'or' is true +; CHECK-LABEL: @test_or2 +; CHECK: call void @bar() +; CHECK: %cmp3 +; CHECK: br i1 %cmp3 +; CHECK: if.then: +; CHECK: call void @foo() +; CHECK: ret +define void @test_or2(i32 %a, i32 %b) { +entry: + %cmp1 = icmp eq i32 %a, 0 + 
%cmp2 = icmp eq i32 %b, 0 + %or = or i1 %cmp1, %cmp2 + br i1 %or, label %taken, label %end + +taken: + call void @bar() + %cmp3 = icmp eq i32 %a, 0 + br i1 %cmp3, label %if.then, label %end + +if.then: + call void @foo() + br label %end + +end: + ret void +} + +; We can recurse a tree of 'and' or 'or's. +; CHECK-LABEL: @test_and_recurse1 +; CHECK: taken: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label %end +; CHECK: ret +define void @test_and_recurse1(i32 %a, i32 %b, i32 %c) { +entry: + %cmpa = icmp eq i32 %a, 0 + %cmpb = icmp eq i32 %b, 0 + %cmpc = icmp eq i32 %c, 0 + %and1 = and i1 %cmpa, %cmpb + %and2 = and i1 %and1, %cmpc + br i1 %and2, label %taken, label %end + +taken: + call void @bar() + %cmp3 = icmp eq i32 %a, 0 + br i1 %cmp3, label %if.then, label %end + +if.then: + call void @foo() + br label %end + +end: + ret void +} + +; Check to make sure we don't recurse too deep. +; CHECK-LABEL: @test_and_recurse2 +; CHECK: taken: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: %cmp3 = icmp eq i32 %a, 0 +; CHECK-NEXT: br i1 %cmp3, label %if.then, label %end +; CHECK: ret +define void @test_and_recurse2(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, + i32 %g, i32 %h) { +entry: + %cmpa = icmp eq i32 %a, 0 + %cmpb = icmp eq i32 %b, 0 + %cmpc = icmp eq i32 %c, 0 + %cmpd = icmp eq i32 %d, 0 + %cmpe = icmp eq i32 %e, 0 + %cmpf = icmp eq i32 %f, 0 + %cmpg = icmp eq i32 %g, 0 + %cmph = icmp eq i32 %h, 0 + %and1 = and i1 %cmpa, %cmpb + %and2 = and i1 %and1, %cmpc + %and3 = and i1 %and2, %cmpd + %and4 = and i1 %and3, %cmpe + %and5 = and i1 %and4, %cmpf + %and6 = and i1 %and5, %cmpg + %and7 = and i1 %and6, %cmph + br i1 %and7, label %taken, label %end + +taken: + call void @bar() + %cmp3 = icmp eq i32 %a, 0 ; <-- can be implied true + br i1 %cmp3, label %if.then, label %end + +if.then: + call void @foo() + br label %end + +end: + ret void +} diff --git a/test/Transforms/SimplifyCFG/sink-common-code.ll b/test/Transforms/SimplifyCFG/sink-common-code.ll index 0f7bfa8516c96..513da477607b9 100644 --- a/test/Transforms/SimplifyCFG/sink-common-code.ll +++ b/test/Transforms/SimplifyCFG/sink-common-code.ll @@ -818,6 +818,30 @@ merge: ; CHECK: right: ; CHECK-NEXT: %val1 = call i32 @call_target() [ "deopt"(i32 20) ] +%T = type {i32, i32} + +define i32 @test_insertvalue(i1 zeroext %flag, %T %P) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + %t1 = insertvalue %T %P, i32 0, 0 + br label %if.end + +if.else: + %t2 = insertvalue %T %P, i32 1, 0 + br label %if.end + +if.end: + %t = phi %T [%t1, %if.then], [%t2, %if.else] + ret i32 1 +} + +; CHECK-LABEL: @test_insertvalue +; CHECK: select +; CHECK: insertvalue +; CHECK-NOT: insertvalue + ; CHECK: ![[TBAA]] = !{![[TYPE:[0-9]]], ![[TYPE]], i64 0} ; CHECK: ![[TYPE]] = !{!"float", ![[TEXT:[0-9]]]} ; CHECK: ![[TEXT]] = !{!"an example type tree"} diff --git a/test/Transforms/Sink/fence.ll b/test/Transforms/Sink/fence.ll index aa237d8192b63..09aa565d88f80 100644 --- a/test/Transforms/Sink/fence.ll +++ b/test/Transforms/Sink/fence.ll @@ -5,9 +5,9 @@ target triple = "x86_64-unknown-linux-gnu" define void @test1(i32* ()*) { entry: %1 = call i32* %0() #0 - fence singlethread seq_cst + fence syncscope("singlethread") seq_cst %2 = load i32, i32* %1, align 4 - fence singlethread seq_cst + fence syncscope("singlethread") seq_cst %3 = icmp eq i32 %2, 0 br i1 %3, label %fail, label %pass @@ -20,9 +20,9 @@ pass: ; preds = %fail, %top ; CHECK-LABEL: @test1( ; CHECK: %[[call:.*]] = call i32* %0() -; CHECK: fence singlethread 
seq_cst +; CHECK: fence syncscope("singlethread") seq_cst ; CHECK: load i32, i32* %[[call]], align 4 -; CHECK: fence singlethread seq_cst +; CHECK: fence syncscope("singlethread") seq_cst attributes #0 = { nounwind readnone } diff --git a/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll b/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll new file mode 100644 index 0000000000000..661d0739401a7 --- /dev/null +++ b/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll @@ -0,0 +1,37 @@ +; Test for a bug specific to the new pass manager where we may build a domtree +; to make more precise AA queries for functions. +; +; RUN: opt -aa-pipeline=default -passes='no-op-module' -debug-pass-manager -thinlto-bc -o %t %s +; RUN: llvm-modextract -b -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=M0 %s +; RUN: llvm-modextract -b -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=M1 %s + +target triple = "x86_64-unknown-linux-gnu" + +%struct.hoge = type { %struct.widget } +%struct.widget = type { i32 (...)** } + +; M0: @global = local_unnamed_addr global +; M1-NOT: @global +@global = local_unnamed_addr global %struct.hoge { %struct.widget { i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @global.1, i32 0, inrange i32 0, i32 2) to i32 (...)**) } }, align 8 + +; M0: @global.1 = external unnamed_addr constant +; M1: @global.1 = linkonce_odr unnamed_addr constant +@global.1 = linkonce_odr unnamed_addr constant { [3 x i8*] } { [3 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @global.4 to i8*), i8* bitcast (i32 (%struct.widget*)* @quux to i8*)] }, align 8, !type !0 + +; M0: @global.2 = external global +; M1-NOT: @global.2 +@global.2 = external global i8* + +; M0: @global.3 = linkonce_odr constant +; M1-NOT: @global.3 +@global.3 = linkonce_odr constant [22 x i8] c"zzzzzzzzzzzzzzzzzzzzz\00" + +; M0: @global.4 = linkonce_odr constant +; M1: @global.4 = external constant +@global.4 = linkonce_odr constant { i8*, i8* }{ i8* bitcast (i8** getelementptr inbounds (i8*, i8** @global.2, i64 2) to i8*), i8* getelementptr inbounds ([22 x i8], [22 x i8]* @global.3, i32 0, i32 0) } + +@llvm.global_ctors = appending global [0 x { i32, void ()*, i8* }] zeroinitializer + +declare i32 @quux(%struct.widget*) unnamed_addr + +!0 = !{i64 16, !"yyyyyyyyyyyyyyyyyyyyyyyyy"} diff --git a/test/Unit/lit.cfg b/test/Unit/lit.cfg index dac0bf829ba6f..9da82f5f2c9bd 100644 --- a/test/Unit/lit.cfg +++ b/test/Unit/lit.cfg @@ -3,6 +3,7 @@ # Configuration file for the 'lit' test runner. import os +import subprocess import lit.formats @@ -75,8 +76,8 @@ if config.test_exec_root is None: lit_config.fatal('No site specific configuration available!') # Get the source and object roots. - llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip() - llvm_obj_root = lit.util.capture(['llvm-config', '--obj-root']).strip() + llvm_src_root = subprocess.check_output(['llvm-config', '--src-root']).strip() + llvm_obj_root = subprocess.check_output(['llvm-config', '--obj-root']).strip() # Validate that we got a tree which points to here. 
this_src_root = os.path.join(os.path.dirname(__file__),'..','..') diff --git a/test/Verifier/2004-05-21-SwitchConstantMismatch.ll b/test/Verifier/2004-05-21-SwitchConstantMismatch.ll index 339a21cac1907..fea290d74c4ad 100644 --- a/test/Verifier/2004-05-21-SwitchConstantMismatch.ll +++ b/test/Verifier/2004-05-21-SwitchConstantMismatch.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 diff --git a/test/Verifier/2007-12-21-InvokeParamAttrs.ll b/test/Verifier/2007-12-21-InvokeParamAttrs.ll index 709b47b33daa1..c62bc0f4e190f 100644 --- a/test/Verifier/2007-12-21-InvokeParamAttrs.ll +++ b/test/Verifier/2007-12-21-InvokeParamAttrs.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 declare void @foo(i8*) diff --git a/test/Verifier/2008-01-11-VarargAttrs.ll b/test/Verifier/2008-01-11-VarargAttrs.ll index af97ce6474492..d3eb7c72699a3 100644 --- a/test/Verifier/2008-01-11-VarargAttrs.ll +++ b/test/Verifier/2008-01-11-VarargAttrs.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 %struct = type { } diff --git a/test/Verifier/2009-05-29-InvokeResult1.ll b/test/Verifier/2009-05-29-InvokeResult1.ll index bb815b3bfe159..38679f4c49fc1 100644 --- a/test/Verifier/2009-05-29-InvokeResult1.ll +++ b/test/Verifier/2009-05-29-InvokeResult1.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 declare i32 @v() diff --git a/test/Verifier/2009-05-29-InvokeResult2.ll b/test/Verifier/2009-05-29-InvokeResult2.ll index 900b1d827bf45..92a51d71efe65 100644 --- a/test/Verifier/2009-05-29-InvokeResult2.ll +++ b/test/Verifier/2009-05-29-InvokeResult2.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 declare i32 @v() diff --git a/test/Verifier/2009-05-29-InvokeResult3.ll b/test/Verifier/2009-05-29-InvokeResult3.ll index 050de4669d350..3fff219cab7dc 100644 --- a/test/Verifier/2009-05-29-InvokeResult3.ll +++ b/test/Verifier/2009-05-29-InvokeResult3.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 declare i32 @v() diff --git a/test/Verifier/byval-1.ll b/test/Verifier/byval-1.ll index 9bbead0861146..9d09a0ffb1176 100644 --- a/test/Verifier/byval-1.ll +++ b/test/Verifier/byval-1.ll @@ -1,2 +1,2 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 declare void @h(i32 byval %num) diff --git a/test/Verifier/element-wise-atomic-memory-intrinsics.ll b/test/Verifier/element-wise-atomic-memory-intrinsics.ll index 470c861c50573..81c8ba16b97d1 100644 --- a/test/Verifier/element-wise-atomic-memory-intrinsics.ll +++ b/test/Verifier/element-wise-atomic-memory-intrinsics.ll @@ -22,4 +22,46 @@ define void @test_memcpy(i8* %P, i8* %Q, i32 %A, i32 %E) { ret void } declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind + +define void @test_memmove(i8* %P, i8* %Q, i32 %A, i32 %E) { + ; CHECK: element size of the element-wise unordered atomic memory intrinsic must be a constant int + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 %E) + ; CHECK: element size of the element-wise atomic memory intrinsic must be a power of 2 + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 3) + + ; CHECK: constant length must be a multiple of the element size in the element-wise 
atomic memory intrinsic + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 7, i32 4) + + ; CHECK: incorrect alignment of the destination argument + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* %P, i8* align 4 %Q, i32 1, i32 1) + ; CHECK: incorrect alignment of the destination argument + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %P, i8* align 4 %Q, i32 4, i32 4) + + ; CHECK: incorrect alignment of the source argument + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* %Q, i32 1, i32 1) + ; CHECK: incorrect alignment of the source argument + call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 1 %Q, i32 4, i32 4) + + ret void +} +declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind + +define void @test_memset(i8* %P, i8 %V, i32 %A, i32 %E) { + ; CHECK: element size of the element-wise unordered atomic memory intrinsic must be a constant int + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 1, i32 %E) + ; CHECK: element size of the element-wise atomic memory intrinsic must be a power of 2 + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 1, i32 3) + + ; CHECK: constant length must be a multiple of the element size in the element-wise atomic memory intrinsic + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 4 %P, i8 %V, i32 7, i32 4) + + ; CHECK: incorrect alignment of the destination argument + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* %P, i8 %V, i32 1, i32 1) + ; CHECK: incorrect alignment of the destination argument + call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %P, i8 %V, i32 4, i32 4) + + ret void +} +declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture, i8, i32, i32) nounwind + ; CHECK: input module is broken! 
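A note on the RUN-line changes made throughout the Verifier tests in this commit (several follow below): the csh/bash-only ">&" redirection is replaced with its POSIX spelling, which any standard sh understands. A minimal sketch of the two forms — the broken.ll filename here is a placeholder for illustration; the real tests use lit's %s substitution:

    # csh/bash-only shorthand: send both stdout and stderr to /dev/null
    not llvm-as < broken.ll >& /dev/null

    # POSIX-portable equivalent used by the updated RUN lines
    not llvm-as < broken.ll > /dev/null 2>&1
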
diff --git a/test/Verifier/gcread-ptrptr.ll b/test/Verifier/gcread-ptrptr.ll index 4ed22fa6c24ec..f8b21bfb4c935 100644 --- a/test/Verifier/gcread-ptrptr.ll +++ b/test/Verifier/gcread-ptrptr.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 ; PR1633 %meta = type { i8* } diff --git a/test/Verifier/gcroot-alloca.ll b/test/Verifier/gcroot-alloca.ll index 8caa4b9f58b56..775bde78250e2 100644 --- a/test/Verifier/gcroot-alloca.ll +++ b/test/Verifier/gcroot-alloca.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 ; PR1633 %meta = type { i8* } diff --git a/test/Verifier/gcroot-meta.ll b/test/Verifier/gcroot-meta.ll index 1836f61c7ad6d..26f7b5156294f 100644 --- a/test/Verifier/gcroot-meta.ll +++ b/test/Verifier/gcroot-meta.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 ; PR1633 %meta = type { i8* } diff --git a/test/Verifier/gcroot-ptrptr.ll b/test/Verifier/gcroot-ptrptr.ll index b573295e3e94f..8d7557d75a491 100644 --- a/test/Verifier/gcroot-ptrptr.ll +++ b/test/Verifier/gcroot-ptrptr.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 ; PR1633 %meta = type { i8* } diff --git a/test/Verifier/gcwrite-ptrptr.ll b/test/Verifier/gcwrite-ptrptr.ll index 1f60becc33271..dec1e6bcd3345 100644 --- a/test/Verifier/gcwrite-ptrptr.ll +++ b/test/Verifier/gcwrite-ptrptr.ll @@ -1,4 +1,4 @@ -; RUN: not llvm-as < %s >& /dev/null +; RUN: not llvm-as < %s > /dev/null 2>&1 ; PR1633 %meta = type { i8* } diff --git a/test/lit.cfg b/test/lit.cfg index ed1ba2d11b1a9..8ed9187aea77c 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -6,6 +6,7 @@ import os import sys import re import platform +import subprocess import lit.util import lit.formats @@ -150,8 +151,8 @@ if config.test_exec_root is None: lit_config.fatal('No site specific configuration available!') # Get the source and object roots. - llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip() - llvm_obj_root = lit.util.capture(['llvm-config', '--obj-root']).strip() + llvm_src_root = subprocess.check_output(['llvm-config', '--src-root']).strip() + llvm_obj_root = subprocess.check_output(['llvm-config', '--obj-root']).strip() # Validate that we got a tree which points to here. this_src_root = os.path.dirname(config.test_source_root) diff --git a/test/tools/llvm-cov/threads.c b/test/tools/llvm-cov/threads.c new file mode 100644 index 0000000000000..00a85edb7ce8b --- /dev/null +++ b/test/tools/llvm-cov/threads.c @@ -0,0 +1,11 @@ +// Coverage/profile data recycled from the showLineExecutionCounts.cpp test. 
+// +// RUN: llvm-profdata merge %S/Inputs/lineExecutionCounts.proftext -o %t.profdata +// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -j 1 -o %t1.dir -instr-profile %t.profdata -filename-equivalence %S/showLineExecutionCounts.cpp +// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -num-threads 2 -o %t2.dir -instr-profile %t.profdata -filename-equivalence %S/showLineExecutionCounts.cpp +// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -o %t3.dir -instr-profile %t.profdata -filename-equivalence %S/showLineExecutionCounts.cpp +// +// RUN: diff %t1.dir/index.txt %t2.dir/index.txt +// RUN: diff %t1.dir/coverage/tmp/showLineExecutionCounts.cpp.txt %t2.dir/coverage/tmp/showLineExecutionCounts.cpp.txt +// RUN: diff %t1.dir/index.txt %t3.dir/index.txt +// RUN: diff %t1.dir/coverage/tmp/showLineExecutionCounts.cpp.txt %t3.dir/coverage/tmp/showLineExecutionCounts.cpp.txt diff --git a/test/tools/llvm-cov/zeroFunctionFile.c b/test/tools/llvm-cov/zeroFunctionFile.c index 87b6ecd3abb3e..d5b983efb8179 100644 --- a/test/tools/llvm-cov/zeroFunctionFile.c +++ b/test/tools/llvm-cov/zeroFunctionFile.c @@ -13,7 +13,7 @@ int main() { // REPORT: 0 0 - 0 0 - 0 0 - 0 0 - // REPORT-NO: 0% -// RUN: llvm-cov show %S/Inputs/zeroFunctionFile.covmapping -format html -instr-profile %t.profdata -o %t.dir +// RUN: llvm-cov show -j 1 %S/Inputs/zeroFunctionFile.covmapping -format html -instr-profile %t.profdata -o %t.dir // RUN: FileCheck %s -input-file=%t.dir/index.html -check-prefix=HTML // HTML: <td class='column-entry-green'><pre>- (0/0) // HTML-NO: 0.00% (0/0) diff --git a/test/tools/llvm-objdump/ARM/Inputs/reloc-half.obj.macho-arm b/test/tools/llvm-objdump/ARM/Inputs/reloc-half.obj.macho-arm Binary files differnew file mode 100644 index 0000000000000..79d19962e00b0 --- /dev/null +++ b/test/tools/llvm-objdump/ARM/Inputs/reloc-half.obj.macho-arm diff --git a/test/tools/llvm-objdump/ARM/macho-reloc-half.test b/test/tools/llvm-objdump/ARM/macho-reloc-half.test new file mode 100644 index 0000000000000..888c7f5891168 --- /dev/null +++ b/test/tools/llvm-objdump/ARM/macho-reloc-half.test @@ -0,0 +1,4 @@ +RUN: llvm-objdump -r %p/Inputs/reloc-half.obj.macho-arm | FileCheck %s + +CHECK-DAG: 00000004 ARM_RELOC_HALF :upper16:(_stringbuf) +CHECK-DAG: 00000000 ARM_RELOC_HALF :lower16:(_stringbuf) diff --git a/test/tools/llvm-objdump/Inputs/test.wasm b/test/tools/llvm-objdump/Inputs/test.wasm Binary files differdeleted file mode 100644 index d3906eeaf6f86..0000000000000 --- a/test/tools/llvm-objdump/Inputs/test.wasm +++ /dev/null diff --git a/test/tools/llvm-objdump/Inputs/trivial.ll b/test/tools/llvm-objdump/Inputs/trivial.ll new file mode 100644 index 0000000000000..6dd510a12b66b --- /dev/null +++ b/test/tools/llvm-objdump/Inputs/trivial.ll @@ -0,0 +1,19 @@ +; Input used for generating checked-in binaries (trivial.obj.*) +; llc -mtriple=wasm32-unknown-unknown-wasm trivial.ll -filetype=obj -o trivial.obj.wasm + +@.str = private unnamed_addr constant [13 x i8] c"Hello World\0A\00", align 1 + +define i32 @main() nounwind { +entry: + %call = tail call i32 @puts(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0)) nounwind + tail call void bitcast (void (...)* @SomeOtherFunction to void ()*)() nounwind + ret i32 0 +} + +declare i32 @puts(i8* nocapture) nounwind + +declare void @SomeOtherFunction(...) 
+ +@var = global i32 0 +@llvm.used = appending global [1 x i8*] [i8* bitcast (i32* @var to i8*)], section "llvm.metadata" +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* null, i8* null }] diff --git a/test/tools/llvm-objdump/Inputs/trivial.obj.wasm b/test/tools/llvm-objdump/Inputs/trivial.obj.wasm Binary files differnew file mode 100644 index 0000000000000..1f3947ac472e0 --- /dev/null +++ b/test/tools/llvm-objdump/Inputs/trivial.obj.wasm diff --git a/test/tools/llvm-objdump/WebAssembly/symbol-table.test b/test/tools/llvm-objdump/WebAssembly/symbol-table.test index 8936c7a12e4c6..2c49d5d65c5d7 100644 --- a/test/tools/llvm-objdump/WebAssembly/symbol-table.test +++ b/test/tools/llvm-objdump/WebAssembly/symbol-table.test @@ -1,8 +1,11 @@ -RUN: llvm-objdump -t %p/../Inputs/test.wasm | FileCheck %s +RUN: llvm-objdump -t %p/../Inputs/trivial.obj.wasm | FileCheck %s + +CHECK: SYMBOL TABLE: +CHECK-NEXT: 00000000 l F IMPORT puts +CHECK-NEXT: 00000000 l F IMPORT SomeOtherFunction +CHECK-NEXT: 00000002 g F EXPORT main +CHECK-NEXT: 00000001 g EXPORT var +CHECK-NEXT: 00000000 l F name puts +CHECK-NEXT: 00000001 l F name SomeOtherFunction +CHECK-NEXT: 00000002 l F name main -CHECK: SYMBOL TABLE: -CHECK: 00000000 l F IMPORT bar -CHECK: 00000000 g F EXPORT baz -CHECK: 00000001 g F EXPORT quux -CHECK: 00000000 l F name $import -CHECK: 00000001 l F name $func0 diff --git a/test/tools/llvm-objdump/wasm.txt b/test/tools/llvm-objdump/wasm.txt index 4aa40c6c9df8c..828fa34b2b46f 100644 --- a/test/tools/llvm-objdump/wasm.txt +++ b/test/tools/llvm-objdump/wasm.txt @@ -1,24 +1,27 @@ -# RUN: llvm-objdump -h %p/Inputs/test.wasm | FileCheck %s +# RUN: llvm-objdump -h %p/Inputs/trivial.obj.wasm | FileCheck %s -# CHECK: Sections: -# CHECK: Idx Name Size Address Type -# CHECK: 0 TYPE 0000000f 0000000000000000 -# CHECK: 1 IMPORT 0000000b 0000000000000000 -# CHECK: 2 FUNCTION 00000003 0000000000000000 -# CHECK: 3 TABLE 00000005 0000000000000000 -# CHECK: 4 EXPORT 0000000e 0000000000000000 -# CHECK: 5 ELEM 00000007 0000000000000000 -# CHECK: 6 CODE 0000002a 0000000000000000 TEXT -# CHECK: 7 name 0000003c 0000000000000000 +# CHECK: Sections: +# CHECK-NEXT: Idx Name Size Address Type +# CHECK-NEXT: 0 TYPE 0000000e 0000000000000000 +# CHECK-NEXT: 1 IMPORT 00000024 0000000000000000 +# CHECK-NEXT: 2 FUNCTION 00000002 0000000000000000 +# CHECK-NEXT: 3 TABLE 00000004 0000000000000000 +# CHECK-NEXT: 4 MEMORY 00000003 0000000000000000 +# CHECK-NEXT: 5 GLOBAL 0000000b 0000000000000000 +# CHECK-NEXT: 6 EXPORT 0000000e 0000000000000000 +# CHECK-NEXT: 7 CODE 00000019 0000000000000000 TEXT +# CHECK-NEXT: 8 DATA 0000001a 0000000000000000 DATA +# CHECK-NEXT: 9 name 0000002b 0000000000000000 +# CHECK-NEXT: 10 reloc.CODE 00000017 0000000000000000 +# CHECK-NEXT: 11 linking 00000016 0000000000000000 -# RUN: llvm-objdump -p %p/Inputs/test.wasm | FileCheck %s -check-prefix CHECK-HEADER +# RUN: llvm-objdump -p %p/Inputs/trivial.obj.wasm | FileCheck %s -check-prefix CHECK-HEADER # CHECK-HEADER: Program Header: # CHECK-HEADER: Version: 0x1 -# RUN: llvm-objdump -s --section=CODE %p/Inputs/test.wasm | FileCheck %s -check-prefix CHECK-SECTIONS +# RUN: llvm-objdump -s --section=CODE %p/Inputs/trivial.obj.wasm | FileCheck %s -check-prefix CHECK-SECTIONS # CHECK-SECTIONS: Contents of section CODE: -# CHECK-SECTIONS: 0000 02070043 0000803f 0b200201 7d017c10 ...C...?. ..}.|. -# CHECK-SECTIONS: 0010 001a4100 10011a41 00410111 00001a20 ..A....A.A..... 
-# CHECK-SECTIONS: 0020 011a4300 00000021 020b ..C....!.. +# CHECK-SECTIONS: 0000 01170041 80808080 00108080 8080001a ...A............ +# CHECK-SECTIONS: 0010 10818080 80004100 0b ......A.. diff --git a/test/tools/llvm-pdbdump/partial-type-stream.test b/test/tools/llvm-pdbdump/partial-type-stream.test index 3a853c3914506..7c62acce7ad4b 100644 --- a/test/tools/llvm-pdbdump/partial-type-stream.test +++ b/test/tools/llvm-pdbdump/partial-type-stream.test @@ -17,8 +17,7 @@ DEPS: Types (TPI Stream) DEPS-NEXT: ============================================================ DEPS-NEXT: Showing 1 records and their dependents (4 records total) DEPS-NEXT: 0x100E | LF_ARGLIST [size = 8] -DEPS-NEXT: 0x1017 | LF_CLASS [size = 60] -DEPS-NEXT: class name: `MembersTest::A` +DEPS-NEXT: 0x1017 | LF_CLASS [size = 60] `MembersTest::A` DEPS-NEXT: unique name: `.?AVA@MembersTest@@` DEPS-NEXT: vtable: <no type>, base list: <no type>, field list: <no type> DEPS-NEXT: options: forward ref | has unique name diff --git a/test/tools/llvm-profdata/c-general.test b/test/tools/llvm-profdata/c-general.test index 0ec7c113eb4c6..ddb95d1260d88 100644 --- a/test/tools/llvm-profdata/c-general.test +++ b/test/tools/llvm-profdata/c-general.test @@ -10,6 +10,7 @@ REGENERATE: $ clang -o a.out -fprofile-instr-generate $CFE_TESTDIR/c-general.c REGENERATE: $ LLVM_PROFILE_FILE=$TESTDIR/Inputs/c-general.profraw ./a.out RUN: llvm-profdata show %p/Inputs/c-general.profraw -o - | FileCheck %s +RUN: llvm-profdata show %p/Inputs/c-general.profraw --topn=3 -o - | FileCheck %s --check-prefix=TOPN RUN: llvm-profdata show %p/Inputs/c-general.profraw -o - --function=switches | FileCheck %s -check-prefix=SWITCHES -check-prefix=CHECK SWITCHES-LABEL: Counters: @@ -22,3 +23,6 @@ SWITCHES-LABEL: Functions shown: 1 CHECK-LABEL: Total functions: 12 CHECK-NEXT: Maximum function count: 1 CHECK-NEXT: Maximum internal block count: 100 +TOPN: boolean_operators, max count = 100 +TOPN-NEXT: simple_loops, max count = 100 +TOPN-NEXT: conditionals, max count = 100 diff --git a/test/tools/llvm-readobj/Inputs/trivial.ll b/test/tools/llvm-readobj/Inputs/trivial.ll index f79b8b897691c..e0e519d064dea 100644 --- a/test/tools/llvm-readobj/Inputs/trivial.ll +++ b/test/tools/llvm-readobj/Inputs/trivial.ll @@ -1,9 +1,11 @@ -; llc -mtriple=i386-pc-win32 trivial.ll -filetype=obj -o trivial-object-test.coff-i386 -; llc -mtriple=x86_64-pc-win32 trivial.ll -filetype=obj -o trivial-object-test.coff-x86-64 -; llc -mtriple=i386-linux-gnu trivial.ll -filetype=obj -o trivial-object-test.elf-i386 -relocation-model=pic -; llc -mtriple=x86_64-linux-gnu trivial.ll -filetype=obj -o trivial-object-test.elf-x86-64 -relocation-model=pic -; llc -mtriple=i386-apple-darwin10 trivial.ll -filetype=obj -o trivial-object-test.macho-i386 -relocation-model=pic -; llc -mtriple=x86_64-apple-darwin10 trivial.ll -filetype=obj -o trivial-object-test.macho-x86-64 -relocation-model=pic +; Input used for generating checked-in binaries (trivial.obj.*) +; llc -mtriple=i386-pc-win32 trivial.ll -filetype=obj -o trivial.obj.coff-i386 +; llc -mtriple=x86_64-pc-win32 trivial.ll -filetype=obj -o trivial.obj.coff-x86-64 +; llc -mtriple=i386-linux-gnu trivial.ll -filetype=obj -o trivial.obj.elf-i386 -relocation-model=pic +; llc -mtriple=x86_64-linux-gnu trivial.ll -filetype=obj -o trivial.obj.elf-x86-64 -relocation-model=pic +; llc -mtriple=i386-apple-darwin10 trivial.ll -filetype=obj -o trivial.obj.macho-i386 -relocation-model=pic +; llc -mtriple=x86_64-apple-darwin10 trivial.ll -filetype=obj -o 
trivial.obj.macho-x86-64 -relocation-model=pic +; llc -mtriple=wasm32-unknown-unknown-wasm trivial.ll -filetype=obj -o trivial.obj.wasm @.str = private unnamed_addr constant [13 x i8] c"Hello World\0A\00", align 1 diff --git a/test/tools/llvm-readobj/Inputs/trivial.obj.wasm b/test/tools/llvm-readobj/Inputs/trivial.obj.wasm Binary files differindex f14192f1798b0..caa702f700153 100644 --- a/test/tools/llvm-readobj/Inputs/trivial.obj.wasm +++ b/test/tools/llvm-readobj/Inputs/trivial.obj.wasm diff --git a/test/tools/llvm-readobj/codeview-linetables.test b/test/tools/llvm-readobj/codeview-linetables.test index fe68e7efdb056..9256aefe4330b 100644 --- a/test/tools/llvm-readobj/codeview-linetables.test +++ b/test/tools/llvm-readobj/codeview-linetables.test @@ -41,7 +41,7 @@ MFUN32: ] MFUN32: Subsection [ MFUN32-NEXT: SubSectionType: Symbols (0xF1) MFUN32-NEXT: SubSectionSize: 0x4B -MFUN32: ProcStart { +MFUN32: GlobalProcIdSym { MFUN32: CodeSize: 0xA MFUN32: DisplayName: x MFUN32: LinkageName: _x @@ -60,7 +60,7 @@ MFUN32: ] MFUN32: Subsection [ MFUN32-NEXT: SubSectionType: Symbols (0xF1) MFUN32-NEXT: SubSectionSize: 0x4B -MFUN32: ProcStart { +MFUN32: GlobalProcIdSym { MFUN32: CodeSize: 0xA MFUN32: DisplayName: y MFUN32: LinkageName: _y @@ -79,7 +79,7 @@ MFUN32: ] MFUN32: Subsection [ MFUN32-NEXT: SubSectionType: Symbols (0xF1) MFUN32-NEXT: SubSectionSize: 0x4B -MFUN32: ProcStart { +MFUN32: GlobalProcIdSym { MFUN32: CodeSize: 0x14 MFUN32: DisplayName: f MFUN32: LinkageName: _f @@ -193,7 +193,7 @@ MFUN64: ] MFUN64: Subsection [ MFUN64-NEXT: SubSectionType: Symbols (0xF1) MFUN64-NEXT: SubSectionSize: 0x4B -MFUN64: ProcStart { +MFUN64: GlobalProcIdSym { MFUN64: CodeSize: 0xE MFUN64: DisplayName: x MFUN64: LinkageName: x @@ -208,7 +208,7 @@ MFUN64-NEXT: ] MFUN64-NEXT: Subsection [ MFUN64-NEXT: SubSectionType: Symbols (0xF1) MFUN64-NEXT: SubSectionSize: 0x4B -MFUN64: ProcStart { +MFUN64: GlobalProcIdSym { MFUN64: CodeSize: 0xE MFUN64: DisplayName: y MFUN64: LinkageName: y @@ -223,7 +223,7 @@ MFUN64-NEXT: ] MFUN64-NEXT: Subsection [ MFUN64-NEXT: SubSectionType: Symbols (0xF1) MFUN64-NEXT: SubSectionSize: 0x4B -MFUN64: ProcStart { +MFUN64: GlobalProcIdSym { MFUN64: CodeSize: 0x18 MFUN64: DisplayName: f MFUN64: LinkageName: f @@ -365,7 +365,7 @@ MFILE32: ] MFILE32: Subsection [ MFILE32-NEXT: SubSectionType: Symbols (0xF1) MFILE32-NEXT: SubSectionSize: 0x4B -MFILE32: ProcStart { +MFILE32: GlobalProcIdSym { MFILE32: CodeSize: 0x14 MFILE32: DisplayName: f MFILE32: LinkageName: _f @@ -442,7 +442,7 @@ MFILE64: ] MFILE64: Subsection [ MFILE64-NEXT: SubSectionType: Symbols (0xF1) MFILE64-NEXT: SubSectionSize: 0x4B -MFILE64: ProcStart { +MFILE64: GlobalProcIdSym { MFILE64: CodeSize: 0x18 MFILE64: DisplayName: f MFILE64: LinkageName: f @@ -528,7 +528,7 @@ RUN: | FileCheck %s -check-prefix MCOMDAT RUN: llvm-readobj -s -codeview -section-symbols %p/Inputs/comdat-function-linetables.obj.coff-2013-i386 \ RUN: | FileCheck %s -check-prefix MCOMDAT -MCOMDAT: ProcStart { +MCOMDAT: GlobalProcIdSym { MCOMDAT: CodeSize: 0x7 MCOMDAT: DisplayName: f MCOMDAT: LinkageName: ?f@@YAHXZ @@ -556,7 +556,7 @@ MCOMDAT-NEXT: IsStatement: Yes MCOMDAT-NEXT: ] MCOMDAT-NEXT: ] MCOMDAT-NEXT: ] -MCOMDAT: ProcStart { +MCOMDAT: GlobalProcIdSym { MCOMDAT: CodeSize: 0x7 MCOMDAT: DisplayName: g MCOMDAT: LinkageName: ?g@@YAHXZ diff --git a/test/tools/llvm-readobj/file-headers.test b/test/tools/llvm-readobj/file-headers.test index 6bc9714f2037e..65ccd50a27294 100644 --- a/test/tools/llvm-readobj/file-headers.test +++ 
b/test/tools/llvm-readobj/file-headers.test @@ -28,9 +28,6 @@ RUN: llvm-readobj -h %p/Inputs/magic.coff-importlib \ RUN: | FileCheck %s -check-prefix COFF-IMPORTLIB RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-lanai \ RUN: | FileCheck %s -check-prefix ELF-LANAI -# trivial.obj.wasm was generated using the following command: -# echo "extern int bar, baz; int foo() { return bar + baz + (int)&foo; }" | \ -# ./bin/clang -c -o trivial.obj.wasm -target wasm32-unknown-unknown-wasm -x c - RUN: llvm-readobj -h %p/Inputs/trivial.obj.wasm \ RUN: | FileCheck %s -check-prefix WASM diff --git a/test/tools/llvm-readobj/relocations.test b/test/tools/llvm-readobj/relocations.test index 9c7dcf1d659c0..85ccd3cefa1b9 100644 --- a/test/tools/llvm-readobj/relocations.test +++ b/test/tools/llvm-readobj/relocations.test @@ -289,21 +289,20 @@ MACHO-ARM-NEXT: ] WASM: Relocations [ WASM-NEXT: Section (8) CODE { WASM-NEXT: Relocation { -WASM-NEXT: Type: R_WEBASSEMBLY_TABLE_INDEX_SLEB (1) -WASM-NEXT: Offset: 0x6 +WASM-NEXT: Type: R_WEBASSEMBLY_GLOBAL_ADDR_SLEB (4) +WASM-NEXT: Offset: 0x4 WASM-NEXT: Index: 0x0 +WASM-NEXT: Addend: 0 WASM-NEXT: } WASM-NEXT: Relocation { -WASM-NEXT: Type: R_WEBASSEMBLY_GLOBAL_ADDR_LEB (3) -WASM-NEXT: Offset: 0x15 +WASM-NEXT: Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB (0) +WASM-NEXT: Offset: 0xA WASM-NEXT: Index: 0x0 -WASM-NEXT: Addend: 0 WASM-NEXT: } WASM-NEXT: Relocation { -WASM-NEXT: Type: R_WEBASSEMBLY_GLOBAL_ADDR_LEB (3) -WASM-NEXT: Offset: 0x24 +WASM-NEXT: Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB (0) +WASM-NEXT: Offset: 0x11 WASM-NEXT: Index: 0x1 -WASM-NEXT: Addend: 0 WASM-NEXT: } WASM-NEXT: } WASM-NEXT: ] diff --git a/test/tools/llvm-readobj/sections.test b/test/tools/llvm-readobj/sections.test index 1747ee45d4f3c..4eda5dae882ac 100644 --- a/test/tools/llvm-readobj/sections.test +++ b/test/tools/llvm-readobj/sections.test @@ -493,62 +493,75 @@ MACHO-ARM-NEXT: Reserved2: 0x0 MACHO-ARM-NEXT: } MACHO-ARM-NEXT:] -WASM: Sections [ -WASM-NEXT: Section { -WASM-NEXT: Type: TYPE (0x1) -WASM-NEXT: Size: 5 -WASM-NEXT: Offset: 8 -WASM-NEXT: } -WASM-NEXT: Section { -WASM-NEXT: Type: IMPORT (0x2) -WASM-NEXT: Size: 23 -WASM-NEXT: Offset: 19 -WASM-NEXT: } -WASM-NEXT: Section { -WASM-NEXT: Type: FUNCTION (0x3) -WASM-NEXT: Size: 2 -WASM-NEXT: Offset: 48 -WASM-NEXT: } -WASM-NEXT: Section { -WASM-NEXT: Type: TABLE (0x4) -WASM-NEXT: Size: 4 -WASM-NEXT: Offset: 56 -WASM-NEXT: } -WASM-NEXT: Section { -WASM-NEXT: Type: MEMORY (0x5) -WASM-NEXT: Size: 3 -WASM-NEXT: Offset: 66 -WASM-NEXT: Memories [ -WASM-NEXT: Memory { -WASM-NEXT: InitialPages: 0 -WASM-NEXT: } -WASM-NEXT: ] -WASM-NEXT: } -WASM-NEXT: Section { -WASM-NEXT: Type: EXPORT (0x7) -WASM-NEXT: Size: 7 -WASM-NEXT: Offset: 75 -WASM-NEXT: } -WASM-NEXT: Section { -WASM-NEXT: Type: ELEM (0x9) -WASM-NEXT: Size: 7 -WASM-NEXT: Offset: 88 -WASM-NEXT: } -WASM-NEXT: Section { -WASM-NEXT: Type: CODE (0xA) -WASM-NEXT: Size: 61 -WASM-NEXT: Offset: 101 -WASM-NEXT: } -WASM-NEXT: Section { -WASM-NEXT: Type: CUSTOM (0x0) -WASM-NEXT: Size: 17 -WASM-NEXT: Offset: 168 -WASM-NEXT: Name: name -WASM-NEXT: } -WASM-NEXT: Section { -WASM-NEXT: Type: CUSTOM (0x0) -WASM-NEXT: Size: 24 -WASM-NEXT: Offset: 191 -WASM-NEXT: Name: reloc.CODE -WASM-NEXT: } -WASM-NEXT:] +WASM: Sections [ +WASM-NEXT: Section { +WASM-NEXT: Type: TYPE (0x1) +WASM-NEXT: Size: 14 +WASM-NEXT: Offset: 8 +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: IMPORT (0x2) +WASM-NEXT: Size: 36 +WASM-NEXT: Offset: 28 +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: FUNCTION (0x3) +WASM-NEXT: Size: 2 +WASM-NEXT: 
Offset: 70 +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: TABLE (0x4) +WASM-NEXT: Size: 4 +WASM-NEXT: Offset: 78 +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: MEMORY (0x5) +WASM-NEXT: Size: 3 +WASM-NEXT: Offset: 88 +WASM-NEXT: Memories [ +WASM-NEXT: Memory { +WASM-NEXT: InitialPages: 1 +WASM-NEXT: } +WASM-NEXT: ] +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: GLOBAL (0x6) +WASM-NEXT: Size: 6 +WASM-NEXT: Offset: 97 +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: EXPORT (0x7) +WASM-NEXT: Size: 8 +WASM-NEXT: Offset: 109 +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: CODE (0xA) +WASM-NEXT: Size: 25 +WASM-NEXT: Offset: 123 +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: DATA (0xB) +WASM-NEXT: Size: 19 +WASM-NEXT: Offset: 154 +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: CUSTOM (0x0) +WASM-NEXT: Size: 43 +WASM-NEXT: Offset: 179 +WASM-NEXT: Name: name +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: CUSTOM (0x0) +WASM-NEXT: Size: 23 +WASM-NEXT: Offset: 228 +WASM-NEXT: Name: reloc.CODE +WASM-NEXT: } +WASM-NEXT: Section { +WASM-NEXT: Type: CUSTOM (0x0) +WASM-NEXT: Size: 22 +WASM-NEXT: Offset: 257 +WASM-NEXT: Name: linking +WASM-NEXT: DataSize: 13 +WASM-NEXT: DataAlignment: 1 +WASM-NEXT: } +WASM-NEXT: ] diff --git a/test/tools/llvm-readobj/symbols.test b/test/tools/llvm-readobj/symbols.test index da8a70b031aba..380c6f6a5ee50 100644 --- a/test/tools/llvm-readobj/symbols.test +++ b/test/tools/llvm-readobj/symbols.test @@ -73,22 +73,32 @@ ELF-NEXT: } WASM: Symbols [ WASM-NEXT: Symbol { -WASM-NEXT: Name: bar -WASM-NEXT: Type: GLOBAL_IMPORT (0x2) +WASM-NEXT: Name: puts +WASM-NEXT: Type: FUNCTION_IMPORT (0x0) WASM-NEXT: Flags: 0x0 WASM-NEXT: } WASM-NEXT: Symbol { -WASM-NEXT: Name: baz -WASM-NEXT: Type: GLOBAL_IMPORT (0x2) +WASM-NEXT: Name: SomeOtherFunction +WASM-NEXT: Type: FUNCTION_IMPORT (0x0) WASM-NEXT: Flags: 0x0 WASM-NEXT: } WASM-NEXT: Symbol { -WASM-NEXT: Name: foo +WASM-NEXT: Name: main WASM-NEXT: Type: FUNCTION_EXPORT (0x1) WASM-NEXT: Flags: 0x0 WASM-NEXT: } WASM-NEXT: Symbol { -WASM-NEXT: Name: foo +WASM-NEXT: Name: puts +WASM-NEXT: Type: DEBUG_FUNCTION_NAME (0x4) +WASM-NEXT: Flags: 0x0 +WASM-NEXT: } +WASM-NEXT: Symbol { +WASM-NEXT: Name: SomeOtherFunction +WASM-NEXT: Type: DEBUG_FUNCTION_NAME (0x4) +WASM-NEXT: Flags: 0x0 +WASM-NEXT: } +WASM-NEXT: Symbol { +WASM-NEXT: Name: main WASM-NEXT: Type: DEBUG_FUNCTION_NAME (0x4) WASM-NEXT: Flags: 0x0 WASM-NEXT: }
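
A note on the binary test inputs touched above: the checked-in trivial.obj.* fixtures are regenerated from the trivial.ll sources added in this commit rather than hand-edited. A minimal sketch of the regeneration step for the wasm object, using only the llc invocation recorded in the input files themselves (run from the matching Inputs/ directory so the output lands next to the tests):

    # rebuild the wasm fixture consumed by the llvm-objdump and llvm-readobj tests
    llc -mtriple=wasm32-unknown-unknown-wasm trivial.ll -filetype=obj -o trivial.obj.wasm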