author    | Dimitry Andric <dim@FreeBSD.org> | 2017-06-01 20:58:36 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-06-01 20:58:36 +0000
commit    | f382538d471e38a9b98f016c4caebd24c8d60b62 (patch)
tree      | d30f3d58b1044b5355d50c17a6a96c6a0b35703a /test
parent    | ee2f195dd3e40f49698ca4dc2666ec09c770e80d (diff)
Diffstat (limited to 'test')
125 files changed, 5980 insertions, 6359 deletions
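The hashes in the header above are enough to regenerate this patch locally in its original line-by-line form. A minimal sketch using stock git, assuming a clone of a repository mirror that contains these objects (e.g. a FreeBSD src or vendor/llvm mirror — the exact remote is an assumption, not stated on this page):

    # Reproduce the diffstat above (commit hash taken from the header)
    git show --stat f382538d471e38a9b98f016c4caebd24c8d60b62 -- test

    # Reproduce the full patch below, as a diff against the listed parent
    git diff ee2f195dd3e40f49698ca4dc2666ec09c770e80d \
             f382538d471e38a9b98f016c4caebd24c8d60b62 -- test

Both commands use only the commit and parent hashes shown in the header, with the pathspec `test` matching the "limited to 'test'" note above.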
diff --git a/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll b/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll new file mode 100644 index 0000000000000..c1d25c1e3c214 --- /dev/null +++ b/test/Analysis/CFLAliasAnalysis/Andersen/struct.ll @@ -0,0 +1,18 @@ +; Ensures that our struct ops are sane. + +; RUN: opt < %s -disable-basicaa -cfl-anders-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -aa-pipeline=cfl-anders-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s + +; Since we ignore non-pointer values, we effectively ignore extractvalue +; instructions. This means that %c "doesn't exist" in test_structure's graph, +; so we currently get MayAlias. +; XFAIL: * + +; CHECK-LABEL: Function: test_structure +; CHECK: NoAlias: i64** %c, { i64**, i64** }* %a +define void @test_structure() { + %a = alloca {i64**, i64**}, align 8 + %b = load {i64**, i64**}, {i64**, i64**}* %a + %c = extractvalue {i64**, i64**} %b, 0 + ret void +} diff --git a/test/Bitcode/thinlto-function-summary-callgraph.ll b/test/Bitcode/thinlto-function-summary-callgraph.ll index 8cc60ad633621..566f3a077e7bf 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph.ll @@ -11,20 +11,23 @@ ; RUN: llvm-lto -thinlto-index-stats %p/Inputs/thinlto-function-summary-callgraph-combined.1.bc | FileCheck %s --check-prefix=OLD-COMBINED ; CHECK: <SOURCE_FILENAME +; CHECK-NEXT: <GLOBALVAR ; CHECK-NEXT: <FUNCTION ; "func" -; CHECK-NEXT: <FUNCTION op0=4 op1=4 +; CHECK-NEXT: <FUNCTION op0=17 op1=4 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK ; CHECK-NEXT: <VERSION ; See if the call to func is registered. -; CHECK-NEXT: <PERMODULE {{.*}} op4=1/> +; CHECK-NEXT: <PERMODULE {{.*}} op3=1 ; CHECK-NEXT: </GLOBALVAL_SUMMARY_BLOCK> ; CHECK: <STRTAB_BLOCK -; CHECK-NEXT: blob data = 'mainfunc' +; CHECK-NEXT: blob data = 'undefinedglobmainfunc' ; COMBINED: <GLOBALVAL_SUMMARY_BLOCK ; COMBINED-NEXT: <VERSION +; Only 2 VALUE_GUID since reference to undefinedglob should not be included in +; combined index. ; COMBINED-NEXT: <VALUE_GUID op0=[[FUNCID:[0-9]+]] op1=7289175272376759421/> ; COMBINED-NEXT: <VALUE_GUID ; COMBINED-NEXT: <COMBINED @@ -40,10 +43,12 @@ target triple = "x86_64-unknown-linux-gnu" define i32 @main() #0 { entry: call void (...) @func() - ret i32 0 + %u = load i32, i32* @undefinedglob + ret i32 %u } declare void @func(...) #1 +@undefinedglob = external global i32 ; OLD: Index {{.*}} contains 1 nodes (1 functions, 0 alias, 0 globals) and 1 edges (0 refs and 1 calls) ; OLD-COMBINED: Index {{.*}} contains 2 nodes (2 functions, 0 alias, 0 globals) and 1 edges (0 refs and 1 calls) diff --git a/test/CodeGen/AArch64/GlobalISel/localizer.mir b/test/CodeGen/AArch64/GlobalISel/localizer.mir index 8fbb2040157e7..5bf8dac79860c 100644 --- a/test/CodeGen/AArch64/GlobalISel/localizer.mir +++ b/test/CodeGen/AArch64/GlobalISel/localizer.mir @@ -12,6 +12,7 @@ define void @non_local_phi_use_followed_by_use() { ret void } define void @non_local_phi_use_followed_by_use_fi() { ret void } define void @float_non_local_phi_use_followed_by_use_fi() { ret void } + define void @non_local_phi() { ret void } ... --- @@ -310,3 +311,51 @@ body: | %3(s32) = PHI %0(s32), %bb.1 %2(s32) = G_FADD %3, %0 ... + +--- +# Make sure we don't insert a constant before PHIs. +# This used to happen for loops of one basic block. 
+# CHECK-LABEL: name: non_local_phi +name: non_local_phi +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: fpr } +#CHECK-NEXT: - { id: 1, class: fpr } +#CHECK-NEXT: - { id: 2, class: fpr } +#CHECK-NEXT: - { id: 3, class: fpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 4, class: fpr } + +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + +# CHECK: body: +# CHECK: %0(s32) = G_FCONSTANT float 1.0 +# CHECK-NEXT: %1(s32) = G_FADD %0, %0 + +# CHECK: bb.1: +# CHECK: %3(s32) = PHI %1(s32), %bb.0, %4(s32), %bb.1 +# CHECK: %4(s32) = G_FCONSTANT float 1.0 + +# CHECK-NEXT: %2(s32) = G_FADD %3, %1 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_FCONSTANT float 1.0 + %1(s32) = G_FADD %0, %0 + + bb.1: + successors: %bb.1 + + %3(s32) = PHI %1(s32), %bb.0, %0(s32), %bb.1 + %2(s32) = G_FADD %3, %1 + G_BR %bb.1 +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir index 96436209451b0..c35d1719f84c8 100644 --- a/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir +++ b/test/CodeGen/AArch64/GlobalISel/select-pr32733.mir @@ -13,7 +13,6 @@ name: main alignment: 2 exposesReturnsTwice: false -noVRegs: false legalized: true regBankSelected: true selected: false diff --git a/test/CodeGen/AArch64/addcarry-crash.ll b/test/CodeGen/AArch64/addcarry-crash.ll new file mode 100644 index 0000000000000..ba833e0b5873c --- /dev/null +++ b/test/CodeGen/AArch64/addcarry-crash.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s | FileCheck %s +target triple = "arm64-apple-ios7.0" + +define i64 @foo(i64* nocapture readonly %ptr, i64 %a, i64 %b, i64 %c) local_unnamed_addr #0 { +; CHECK: ldr w8, [x0, #4] +; CHECK: lsr x9, x1, #32 +; CHECK: cmn x3, x2 +; CHECK: mul x8, x8, x9 +; CHECK: cinc x0, x8, hs +; CHECK: ret +entry: + %0 = lshr i64 %a, 32 + %1 = load i64, i64* %ptr, align 8 + %2 = lshr i64 %1, 32 + %3 = mul nuw i64 %2, %0 + %4 = add i64 %c, %b + %5 = icmp ult i64 %4, %c + %6 = zext i1 %5 to i64 + %7 = add i64 %3, %6 + ret i64 %7 +} + +attributes #0 = { norecurse nounwind readonly } diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll index 1d8787212579a..bd7c69c910c0e 100644 --- a/test/CodeGen/AArch64/misched-fusion-aes.ll +++ b/test/CodeGen/AArch64/misched-fusion-aes.ll @@ -1,5 +1,7 @@ -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72 -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a53 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a73 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCORTEX ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1 declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k) @@ -72,22 +74,22 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* 
%c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesea: -; CHECKA57A72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] -; CHECKA57A72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] -; CHECKA57A72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] -; CHECKA57A72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] -; CHECKA57A72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] -; CHECKA57A72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] -; CHECKA57A72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VG]] -; CHECKA57A72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +; CHECKCORTEX: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKCORTEX: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKCORTEX: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKCORTEX: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKCORTEX: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKCORTEX: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKCORTEX: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VG]] +; CHECKCORTEX: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesmc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]] @@ -173,22 +175,22 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesda: -; CHECKA57A72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] -; CHECKA57A72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] -; CHECKA57A72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] -; CHECKA57A72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] -; CHECKA57A72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] -; CHECKA57A72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VF]] -; CHECKA57A72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] -; CHECKA57A72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +; CHECKCORTEX: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKCORTEX: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKCORTEX: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKCORTEX: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKCORTEX: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKCORTEX: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VF]] +; CHECKCORTEX: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; 
CHECKCORTEX: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKCORTEX-NEXT: aesimc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]] diff --git a/test/CodeGen/AArch64/pr33172.ll b/test/CodeGen/AArch64/pr33172.ll new file mode 100644 index 0000000000000..1e1da78b28ff4 --- /dev/null +++ b/test/CodeGen/AArch64/pr33172.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s + +; CHECK-LABEL: pr33172 +; CHECK: ldp +; CHECK: stp + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios10.3.0" + +@main.b = external global [200 x float], align 8 +@main.x = external global [200 x float], align 8 + +; Function Attrs: nounwind ssp +define void @pr33172() local_unnamed_addr { +entry: + %wide.load8281058.3 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 12) to i64*), align 8 + %wide.load8291059.3 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 14) to i64*), align 8 + store i64 %wide.load8281058.3, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 12) to i64*), align 8 + store i64 %wide.load8291059.3, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 14) to i64*), align 8 + %wide.load8281058.4 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 16) to i64*), align 8 + %wide.load8291059.4 = load i64, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.b, i64 0, i64 18) to i64*), align 8 + store i64 %wide.load8281058.4, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 16) to i64*), align 8 + store i64 %wide.load8291059.4, i64* bitcast (float* getelementptr inbounds ([200 x float], [200 x float]* @main.x, i64 0, i64 18) to i64*), align 8 + tail call void @llvm.memset.p0i8.i64(i8* bitcast ([200 x float]* @main.b to i8*), i8 0, i64 undef, i32 8, i1 false) #2 + unreachable +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1 + +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll index a3a78d326a628..02642142ae2cd 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0 ; FUNC-LABEL: {{^}}ds_swizzle: -; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100 +; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:swizzle(BITMASK_PERM,"00p11") ; CHECK: s_waitcnt lgkmcnt define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 diff --git a/test/CodeGen/AMDGPU/merge-m0.mir b/test/CodeGen/AMDGPU/merge-m0.mir index 064db49924e15..720642ad1ddb9 100644 --- a/test/CodeGen/AMDGPU/merge-m0.mir +++ b/test/CodeGen/AMDGPU/merge-m0.mir @@ -50,7 +50,6 @@ name: test alignment: 0 exposesReturnsTwice: false -noVRegs: false legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir index cd50e01032c38..cd0d410368c7d 100644 --- a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir +++ 
b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir @@ -86,7 +86,6 @@ name: sdwa_imm_operand alignment: 0 exposesReturnsTwice: false -noVRegs: false legalized: false regBankSelected: false selected: false @@ -248,7 +247,6 @@ body: | name: sdwa_sgpr_operand alignment: 0 exposesReturnsTwice: false -noVRegs: false legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/AMDGPU/waitcnt-permute.mir b/test/CodeGen/AMDGPU/waitcnt-permute.mir new file mode 100644 index 0000000000000..44dbd38f2d300 --- /dev/null +++ b/test/CodeGen/AMDGPU/waitcnt-permute.mir @@ -0,0 +1,33 @@ +# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s + +--- | + define float @waitcnt-permute(i32 %x, i32 %y) { + entry: + %0 = call i32 @llvm.amdgcn.ds.bpermute(i32 %x, i32 %y) + %1 = bitcast i32 %0 to float + %2 = fadd float 1.000000e+00, %1 + ret float %2 + } + + declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) + +... +--- +# CHECK-LABEL: name: waitcnt-permute{{$}} +# CHECK: DS_BPERMUTE_B32 +# CHECK-NEXT: S_WAITCNT 127 + +name: waitcnt-permute +liveins: + - { reg: '%vgpr0' } + - { reg: '%vgpr1' } + - { reg: '%sgpr30_sgpr31' } +body: | + bb.0: + liveins: %vgpr0, %vgpr1, %sgpr30_sgpr31 + + %vgpr0 = DS_BPERMUTE_B32 killed %vgpr0, killed %vgpr1, 0, implicit %exec + %vgpr0 = V_ADD_F32_e32 1065353216, killed %vgpr0, implicit %exec + S_SETPC_B64_return killed %sgpr30_sgpr31, implicit killed %vgpr0 + +... diff --git a/test/CodeGen/ARM/cmpxchg-O0.ll b/test/CodeGen/ARM/cmpxchg-O0.ll index f8ad2bbbbe0e4..a3be72112c761 100644 --- a/test/CodeGen/ARM/cmpxchg-O0.ll +++ b/test/CodeGen/ARM/cmpxchg-O0.ll @@ -10,10 +10,11 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { ; CHECK: dmb ish ; CHECK: uxtb [[DESIRED:r[0-9]+]], [[DESIRED]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0 ; CHECK: ldrexb [[OLD:r[0-9]+]], [r0] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexb [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: strexb [[STATUS]], r2, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: @@ -29,10 +30,11 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind ; CHECK: dmb ish ; CHECK: uxth [[DESIRED:r[0-9]+]], [[DESIRED]] ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0 ; CHECK: ldrexh [[OLD:r[0-9]+]], [r0] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strexh [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: strexh [[STATUS]], r2, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: @@ -48,10 +50,11 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind ; CHECK: dmb ish ; CHECK-NOT: uxt ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov{{s?}} [[STATUS:r[0-9]+]], #0 ; CHECK: ldrex [[OLD:r[0-9]+]], [r0] ; CHECK: cmp [[OLD]], [[DESIRED]] ; CHECK: bne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: strex [[STATUS:r[0-9]+]], r2, [r0] +; CHECK: strex [[STATUS]], r2, [r0] ; CHECK: cmp{{(\.w)?}} [[STATUS]], #0 ; CHECK: bne [[RETRY]] ; CHECK: [[DONE]]: diff --git a/test/CodeGen/ARM/v6-jumptable-clobber.mir b/test/CodeGen/ARM/v6-jumptable-clobber.mir index 0e9bc42565f3b..6577ef8486713 100644 --- a/test/CodeGen/ARM/v6-jumptable-clobber.mir +++ b/test/CodeGen/ARM/v6-jumptable-clobber.mir @@ -190,7 +190,6 @@ name: foo alignment: 1 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false 
@@ -289,7 +288,6 @@ body: | name: bar alignment: 1 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/AVR/rot.ll b/test/CodeGen/AVR/rot.ll index e43daf3e6aa85..a7b77d97ba698 100644 --- a/test/CodeGen/AVR/rot.ll +++ b/test/CodeGen/AVR/rot.ll @@ -6,7 +6,7 @@ define i8 @rol8(i8 %val, i8 %amt) { ; CHECK: andi r22, 7 - ; CHECK-NEXT: cp r22, r0 + ; CHECK-NEXT: cpi r22, 0 ; CHECK-NEXT: breq LBB0_2 ; CHECK-NEXT: LBB0_1: @@ -32,7 +32,7 @@ define i8 @rol8(i8 %val, i8 %amt) { define i8 @ror8(i8 %val, i8 %amt) { ; CHECK: andi r22, 7 - ; CHECK-NEXT: cp r22, r0 + ; CHECK-NEXT: cpi r22, 0 ; CHECK-NEXT: breq LBB1_2 ; CHECK-NEXT: LBB1_1: diff --git a/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir b/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir new file mode 100644 index 0000000000000..2233e3289f112 --- /dev/null +++ b/test/CodeGen/Hexagon/invalid-dotnew-attempt.mir @@ -0,0 +1,17 @@ +# RUN: llc -march=hexagon -start-after if-converter %s -o - | FileCheck %s +# CHECK: p0 = r0 +# CHECK-NEXT: jumpr r31 + +# Make sure that the packetizer does not attempt to newify the J2_jumpr +# only because of the def-use of p0. + +--- +name: fred +tracksRegLiveness: true +body: | + bb.0: + liveins: %d0 + %p0 = C2_tfrrp %r0 + J2_jumpr %r31, implicit-def %pc, implicit %p0 +... + diff --git a/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll b/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll new file mode 100644 index 0000000000000..b25010f2a90fe --- /dev/null +++ b/test/CodeGen/Hexagon/loop-idiom/pmpy-long-loop.ll @@ -0,0 +1,62 @@ +; RUN: opt -march=hexagon -hexagon-loop-idiom -S < %s | FileCheck %s +; +; The number of nested selects caused the simplification loop to take +; more than the maximum number of iterations. This caused the compiler +; to crash under suspicion of an infinite loop. This (still reduced) +; testcase shows a legitimate case where this limit was exceeded. +; Instead of crashing, gracefully abort the simplification. +; +; Check for sane output. 
+; CHECK: define void @fred + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define void @fred() unnamed_addr #0 { +b0: + %v1 = select i1 false, i32 undef, i32 2 + br label %b2 + +b2: ; preds = %b2, %b0 + %v3 = sext i16 undef to i32 + %v4 = add nsw i32 %v1, %v3 + %v5 = select i1 undef, i32 undef, i32 %v4 + %v6 = icmp slt i32 %v5, undef + %v7 = select i1 %v6, i32 %v5, i32 undef + %v8 = icmp slt i32 %v7, 0 + %v9 = select i1 %v8, i32 %v7, i32 0 + %v10 = sub i32 undef, undef + %v11 = add i32 %v10, %v9 + %v12 = sext i16 undef to i32 + %v13 = sext i16 undef to i32 + %v14 = add nsw i32 %v1, %v13 + %v15 = select i1 undef, i32 undef, i32 %v14 + %v16 = icmp slt i32 %v15, undef + %v17 = select i1 %v16, i32 %v15, i32 undef + %v18 = select i1 undef, i32 %v17, i32 %v12 + %v19 = add i32 undef, %v18 + %v20 = sext i16 undef to i32 + %v21 = sext i16 0 to i32 + %v22 = add nsw i32 %v1, %v21 + %v23 = sext i16 undef to i32 + %v24 = add nsw i32 %v1, %v23 + %v25 = select i1 undef, i32 undef, i32 %v24 + %v26 = icmp slt i32 %v25, %v22 + %v27 = select i1 %v26, i32 %v25, i32 %v22 + %v28 = icmp slt i32 %v27, %v20 + %v29 = select i1 %v28, i32 %v27, i32 %v20 + %v30 = add i32 undef, %v29 + %v31 = add i32 %v11, undef + %v32 = add i32 %v31, undef + %v33 = add i32 %v32, %v19 + %v34 = add i32 %v33, %v30 + %v35 = add nsw i32 %v34, 32768 + %v36 = icmp ult i32 %v35, 65536 + %v37 = select i1 %v36, i32 %v34, i32 undef + br i1 undef, label %b2, label %b38 + +b38: ; preds = %b2 + unreachable +} + +attributes #0 = { "target-cpu"="hexagonv60" } diff --git a/test/CodeGen/Hexagon/mul64-sext.ll b/test/CodeGen/Hexagon/mul64-sext.ll new file mode 100644 index 0000000000000..8bbe6649a1fbc --- /dev/null +++ b/test/CodeGen/Hexagon/mul64-sext.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +target triple = "hexagon-unknown--elf" + +; CHECK-LABEL: mul_1 +; CHECK: r1:0 = mpy(r2,r0) +define i64 @mul_1(i64 %a0, i64 %a1) #0 { +b2: + %v3 = shl i64 %a0, 32 + %v4 = ashr exact i64 %v3, 32 + %v5 = shl i64 %a1, 32 + %v6 = ashr exact i64 %v5, 32 + %v7 = mul nsw i64 %v6, %v4 + ret i64 %v7 +} + +; CHECK-LABEL: mul_2 +; CHECK: r0 = memb(r0+#0) +; CHECK: r1:0 = mpy(r2,r0) +; CHECK: jumpr r31 +define i64 @mul_2(i8* %a0, i64 %a1) #0 { +b2: + %v3 = load i8, i8* %a0 + %v4 = sext i8 %v3 to i64 + %v5 = shl i64 %a1, 32 + %v6 = ashr exact i64 %v5, 32 + %v7 = mul nsw i64 %v6, %v4 + ret i64 %v7 +} + +; CHECK-LABEL: mul_acc_1 +; CHECK: r5:4 += mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_acc_1(i64 %a0, i64 %a1, i64 %a2) #0 { +b3: + %v4 = shl i64 %a0, 32 + %v5 = ashr exact i64 %v4, 32 + %v6 = shl i64 %a1, 32 + %v7 = ashr exact i64 %v6, 32 + %v8 = mul nsw i64 %v7, %v5 + %v9 = add i64 %a2, %v8 + ret i64 %v9 +} + +; CHECK-LABEL: mul_acc_2 +; CHECK: r2 = memw(r2+#0) +; CHECK: r5:4 += mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_acc_2(i64 %a0, i32* %a1, i64 %a2) #0 { +b3: + %v4 = shl i64 %a0, 32 + %v5 = ashr exact i64 %v4, 32 + %v6 = load i32, i32* %a1 + %v7 = sext i32 %v6 to i64 + %v8 = mul nsw i64 %v7, %v5 + %v9 = add i64 %a2, %v8 + ret i64 %v9 +} + +; CHECK-LABEL: mul_nac_1 +; CHECK: r5:4 -= mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_nac_1(i64 %a0, i64 %a1, i64 %a2) #0 { +b3: + %v4 = shl i64 %a0, 32 + %v5 = ashr exact i64 %v4, 32 + %v6 = shl i64 %a1, 32 + %v7 = ashr exact i64 %v6, 32 + %v8 = 
mul nsw i64 %v7, %v5 + %v9 = sub i64 %a2, %v8 + ret i64 %v9 +} + +; CHECK-LABEL: mul_nac_2 +; CHECK: r0 = memw(r0+#0) +; CHECK: r5:4 -= mpy(r2,r0) +; CHECK: r1:0 = combine(r5,r4) +; CHECK: jumpr r31 +define i64 @mul_nac_2(i32* %a0, i64 %a1, i64 %a2) #0 { +b3: + %v4 = load i32, i32* %a0 + %v5 = sext i32 %v4 to i64 + %v6 = shl i64 %a1, 32 + %v7 = ashr exact i64 %v6, 32 + %v8 = mul nsw i64 %v7, %v5 + %v9 = sub i64 %a2, %v8 + ret i64 %v9 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MIR/Generic/multiRunPass.mir b/test/CodeGen/MIR/Generic/multiRunPass.mir index bca007de80b7c..bd1c0d0b458e5 100644 --- a/test/CodeGen/MIR/Generic/multiRunPass.mir +++ b/test/CodeGen/MIR/Generic/multiRunPass.mir @@ -7,8 +7,8 @@ # This test ensures that the command line accepts # several run passes on the same command line and # actually create the proper pipeline for it. -# PSEUDO_PEEPHOLE: -expand-isel-pseudos -peephole-opt -# PEEPHOLE_PSEUDO: -peephole-opt -expand-isel-pseudos +# PSEUDO_PEEPHOLE: -expand-isel-pseudos {{(-machineverifier )?}}-peephole-opt +# PEEPHOLE_PSEUDO: -peephole-opt {{(-machineverifier )?}}-expand-isel-pseudos # Make sure there are no other passes happening after what we asked. # CHECK-NEXT: --- | diff --git a/test/CodeGen/Mips/compactbranches/empty-block.mir b/test/CodeGen/Mips/compactbranches/empty-block.mir index 7831e51e31579..7fb1afae91210 100644 --- a/test/CodeGen/Mips/compactbranches/empty-block.mir +++ b/test/CodeGen/Mips/compactbranches/empty-block.mir @@ -39,7 +39,6 @@ name: l5 alignment: 2 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/PowerPC/expand-isel.ll b/test/CodeGen/PowerPC/expand-isel.ll index 553cc3c372e5b..c8707bda8e84a 100644 --- a/test/CodeGen/PowerPC/expand-isel.ll +++ b/test/CodeGen/PowerPC/expand-isel.ll @@ -212,13 +212,14 @@ cleanup: ret i32 %retval.0 ; CHECK-LABEL: @testComplexISEL -; CHECK: bc 12, 2, [[TRUE:.LBB[0-9]+]] -; CHECK-NEXT: b [[SUCCESSOR:.LBB[0-9]+]] -; CHECK-NEXT: [[TRUE]] -; CHECK-NEXT: addi r3, r12, 0 -; CHECK-NEXT: [[SUCCESSOR]] -; CHECK-NEXT: clrldi r3, r3, 32 -; CHECK-NEXT: blr +; CHECK-DAG: [[LI:r[0-9]+]], 1 +; CHECK-DAG: cmplwi [[LD:r[0-9]+]], 0 +; CHECK: beq cr0, [[EQ:.LBB[0-9_]+]] +; CHECK: blr +; CHECK: [[EQ]] +; CHECK: xor [[XOR:r[0-9]+]] +; CHECK: cntlzd [[CZ:r[0-9]+]], [[XOR]] +; CHECK: rldicl [[SH:r[0-9]+]], [[CZ]], 58, 63 } !1 = !{!2, !2, i64 0} diff --git a/test/CodeGen/PowerPC/logic-ops-on-compares.ll b/test/CodeGen/PowerPC/logic-ops-on-compares.ll new file mode 100644 index 0000000000000..df021c20ea86e --- /dev/null +++ b/test/CodeGen/PowerPC/logic-ops-on-compares.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +; Function Attrs: nounwind +define signext i32 @logic_ne_32(i32 signext %a, i32 signext %b, i32 signext %c) { +; CHECK-LABEL: logic_ne_32: +; CHECK: xor r7, r3, r4 +; CHECK-NEXT: li r6, 55 +; CHECK-NEXT: xor r5, r5, r6 +; CHECK-NEXT: or r7, r7, r4 +; CHECK-NEXT: cntlzw r5, r5 +; CHECK-NEXT: cntlzw r6, r7 +; CHECK-NEXT: srwi r6, r6, 5 
+; CHECK-NEXT: srwi r5, r5, 5 +; CHECK-NEXT: or. r5, r6, r5 +; CHECK-NEXT: bc 4, 1 +entry: + %tobool = icmp eq i32 %a, %b + %tobool1 = icmp eq i32 %b, 0 + %or.cond = and i1 %tobool, %tobool1 + %tobool3 = icmp eq i32 %c, 55 + %or.cond5 = or i1 %or.cond, %tobool3 + br i1 %or.cond5, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call signext i32 @foo(i32 signext %a) #2 + br label %return + +if.end: ; preds = %entry + %call4 = tail call signext i32 @bar(i32 signext %b) #2 + br label %return + +return: ; preds = %if.end, %if.then + %retval.0 = phi i32 [ %call4, %if.end ], [ %call, %if.then ] + ret i32 %retval.0 +} + +define void @neg_truncate_i32(i32 *%ptr) { +; CHECK-LABEL: neg_truncate_i32: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lwz r3, 0(r3) +; CHECK-NEXT: rldicl. r3, r3, 0, 63 +; CHECK-NEXT: bclr 12, 2, 0 +; CHECK-NEXT: # BB#1: # %if.end29.thread136 +; CHECK-NEXT: .LBB1_2: # %if.end29 +entry: + %0 = load i32, i32* %ptr, align 4 + %rem17127 = and i32 %0, 1 + %cmp18 = icmp eq i32 %rem17127, 0 + br label %if.else + +if.else: ; preds = %entry + br i1 %cmp18, label %if.end29, label %if.end29.thread136 + +if.end29.thread136: ; preds = %if.else + unreachable + +if.end29: ; preds = %if.else + ret void + +} + +; Function Attrs: nounwind +define i64 @logic_ne_64(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: logic_ne_64: +; CHECK: xor r7, r3, r4 +; CHECK-NEXT: li r6, 55 +; CHECK-NEXT: xor r5, r5, r6 +; CHECK-NEXT: or r7, r7, r4 +; CHECK-NEXT: cntlzd r6, r7 +; CHECK-NEXT: cntlzd r5, r5 +; CHECK-NEXT: rldicl r6, r6, 58, 63 +; CHECK-NEXT: rldicl r5, r5, 58, 63 +; CHECK-NEXT: or. r5, r6, r5 +; CHECK-NEXT: bc 4, 1 +entry: + %tobool = icmp eq i64 %a, %b + %tobool1 = icmp eq i64 %b, 0 + %or.cond = and i1 %tobool, %tobool1 + %tobool3 = icmp eq i64 %c, 55 + %or.cond5 = or i1 %or.cond, %tobool3 + br i1 %or.cond5, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call i64 @foo64(i64 %a) #2 + br label %return + +if.end: ; preds = %entry + %call4 = tail call i64 @bar64(i64 %b) #2 + br label %return + +return: ; preds = %if.end, %if.then + %retval.0 = phi i64 [ %call4, %if.end ], [ %call, %if.then ] + ret i64 %retval.0 +} + +define void @neg_truncate_i64(i64 *%ptr) { +; CHECK-LABEL: neg_truncate_i64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: ld r3, 0(r3) +; CHECK-NEXT: rldicl. 
r3, r3, 0, 63 +; CHECK-NEXT: bclr 12, 2, 0 +; CHECK-NEXT: # BB#1: # %if.end29.thread136 +; CHECK-NEXT: .LBB3_2: # %if.end29 +entry: + %0 = load i64, i64* %ptr, align 4 + %rem17127 = and i64 %0, 1 + %cmp18 = icmp eq i64 %rem17127, 0 + br label %if.else + +if.else: ; preds = %entry + br i1 %cmp18, label %if.end29, label %if.end29.thread136 + +if.end29.thread136: ; preds = %if.else + unreachable + +if.end29: ; preds = %if.else + ret void + +} + +declare signext i32 @foo(i32 signext) +declare signext i32 @bar(i32 signext) +declare i64 @foo64(i64) +declare i64 @bar64(i64) diff --git a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll new file mode 100644 index 0000000000000..3095429758f64 --- /dev/null +++ b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -0,0 +1,121 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4 +@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4 +@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4 +@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4 +@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4 +@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4 + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +; Validate with if(memcmp()) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16) + %not.tobool = icmp ne i32 %call, 0 + %. = zext i1 %not.tobool to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest01 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with if(memcmp() == 0) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp ne i32 %call, 0 + %. = zext i1 %not.cmp to i32 + ret i32 %. 
+ + ; CHECK-LABEL: @zeroEqualityTest02 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with > 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp slt i32 %call, 1 + %. = zext i1 %not.cmp to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest03 + ; CHECK-LABEL: %res_block + ; CHECK: cmpld + ; CHECK-NEXT: li [[LI:[0-9]+]], 1 + ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 + ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 +} + +; Validate with < 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16) + %call.lobit = lshr i32 %call, 31 + %call.lobit.not = xor i32 %call.lobit, 1 + ret i32 %call.lobit.not + + ; CHECK-LABEL: @zeroEqualityTest04 + ; CHECK-LABEL: %res_block + ; CHECK: cmpld + ; CHECK-NEXT: li [[LI:[0-9]+]], 1 + ; CHECK-NEXT: li [[LI2:[0-9]+]], -1 + ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0 +} + +; Validate with memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.tobool = icmp eq i32 %call, 0 + %cond = zext i1 %not.tobool to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest05 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK: li 3, 0 +} + +; Validate with !memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.lnot = icmp ne i32 %call, 0 + %cond = zext i1 %not.lnot to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest06 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} diff --git a/test/CodeGen/PowerPC/memcmp.ll b/test/CodeGen/PowerPC/memcmp.ll new file mode 100644 index 0000000000000..bae713cb2072c --- /dev/null +++ b/test/CodeGen/PowerPC/memcmp.ll @@ -0,0 +1,87 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=CHECK + +; Check size 8 +; Function Attrs: nounwind readonly +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2 + ret i32 %call + +; CHECK-LABEL: @test1 +; CHECK: ldbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. 
[[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 4 +; Function Attrs: nounwind readonly +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2 + ret i32 %call + +; CHECK-LABEL: @test2 +; CHECK: lwbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 2 +; Function Attrs: nounwind readonly +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2 + ret i32 %call + +; CHECK-LABEL: @test3 +; CHECK: lhbrx [[LOAD1:[0-9]+]] +; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]] +; CHECK-NEXT: li [[LI:[0-9]+]], 1 +; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: li [[LI2:[0-9]+]], -1 +; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4 +; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2 +; CHECK-NEXT: extsw 3, [[ISEL2]] +; CHECK-NEXT: blr +} + +; Check size 1 +; Function Attrs: nounwind readonly +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2 + ret i32 %call + +; CHECK-LABEL: @test4 +; CHECK: lbz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lbz [[LOAD2:[0-9]+]] +; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +} + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8*, i8*, i64) #1 diff --git a/test/CodeGen/PowerPC/memcmpIR.ll b/test/CodeGen/PowerPC/memcmpIR.ll new file mode 100644 index 0000000000000..f052cc258df8d --- /dev/null +++ b/test/CodeGen/PowerPC/memcmpIR.ll @@ -0,0 +1,194 @@ +; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s +; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE + +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { +entry: + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, 
i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + + ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16) + ret i32 %call +} + +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) + ret i32 %call +} + +define signext i32 @test3(i32* nocapture readonly 
%buffer1, i32* nocapture readonly %buffer2) { + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-NEXT: br label %endblock + + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]]) + ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]]) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 + ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 + ; CHECK-BE-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: 
[[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: br label %endblock + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15) + ret i32 %call +} + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) { + +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65) + ret i32 %call +} + +define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) { + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %conv = sext i32 %SIZE to i64 + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv) + ret i32 %call +} diff --git a/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll b/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll new file mode 100644 index 0000000000000..7ca5332865caa --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-get-cache-line-size.ll @@ -0,0 +1,49 @@ +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-ppc-prefetching=true | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-ppc-prefetching=true | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -enable-ppc-prefetching=true | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -enable-ppc-prefetching=true -ppc-loop-prefetch-cache-line=64 | FileCheck %s -check-prefix=CHECK-DCBT +; RUN: llc < %s -march=ppc64 -mcpu=a2 -enable-ppc-prefetching=true | FileCheck %s -check-prefix=CHECK-DCBT + +; Function Attrs: nounwind +define signext i32 @check_cache_line() local_unnamed_addr { +entry: + %call = tail call i32* bitcast (i32* (...)* @magici to i32* ()*)() + %call115 = tail call signext i32 bitcast (i32 (...)* @iter to i32 ()*)() + %cmp16 = icmp sgt i32 %call115, 0 + br i1 %cmp16, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %add5, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %res.017 = phi i32 [ %add5, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %call, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %res.017 + %1 = add nuw nsw i64 %indvars.iv, 16 + %arrayidx4 = getelementptr inbounds i32, i32* %call, i64 %1 + %2 = load i32, i32* %arrayidx4, align 4 + %add5 = add nsw i32 %add, %2 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %call1 = tail call signext i32 bitcast (i32 (...)* @iter to i32 ()*)() + %3 = sext i32 %call1 to i64 + %cmp = icmp slt i64 %indvars.iv.next, %3 + br i1 %cmp, label %for.body, label 
%for.cond.cleanup +; CHECK-LABEL: check_cache_line +; CHECK: dcbt +; CHECK-NOT: dcbt +; CHECK: blr +; CHECK-DCBT-LABEL: check_cache_line +; CHECK-DCBT: dcbt +; CHECK-DCBT: dcbt +; CHECK-DCBT: blr +} + +declare i32* @magici(...) local_unnamed_addr + +declare signext i32 @iter(...) local_unnamed_addr + diff --git a/test/CodeGen/PowerPC/pristine-and-livein.mir b/test/CodeGen/PowerPC/pristine-and-livein.mir deleted file mode 100644 index 6d93bb68c102c..0000000000000 --- a/test/CodeGen/PowerPC/pristine-and-livein.mir +++ /dev/null @@ -1,330 +0,0 @@ -# RUN: llc -run-pass=post-RA-sched %s -o - | FileCheck %s - -# CHECK: callee-saved-register: '[[REG:%x[0-9]+]]' -# CHECK: callee-saved-register: '{{%x[0-9]+}}' -# CHECK-NOT: [[REG]] = LI8 0 -# CHECK: STD killed [[REG]], ---- | - ; ModuleID = '<stdin>' - source_filename = "bugpoint-output-4d91ae2.bc" - target datalayout = "e-m:e-i64:64-n32:64" - target triple = "powerpc64le--linux-gnu" - - ; Function Attrs: norecurse nounwind readonly - define i64 @adler32_z(i64 %adler, i8* readonly %buf, i64 %len) local_unnamed_addr #0 { - entry: - %shr = lshr i64 %adler, 16 - %and = and i64 %shr, 65535 - %and1 = and i64 %adler, 65535 - br i1 undef, label %if.then, label %if.end15 - - if.then: ; preds = %entry - %add5 = add nsw i64 %and1, %and - %sub9 = add nsw i64 %add5, 281474976645135 - %shl = shl i64 %add5, 16 - %or = or i64 %shl, %and1 - br label %cleanup - - if.end15: ; preds = %entry - br i1 undef, label %while.cond.preheader, label %while.cond30.preheader - - while.cond30.preheader: ; preds = %if.end15 - br i1 undef, label %while.body33.preheader, label %while.body109.preheader - - while.body33.preheader: ; preds = %while.cond30.preheader - br label %while.body33 - - while.cond.preheader: ; preds = %if.end15 - %sub25 = add i64 %and1, -65521 - %rem = urem i64 %and, 65521 - %shl27 = shl nuw nsw i64 %rem, 16 - %or28 = or i64 %shl27, %and1 - br label %cleanup - - while.body33: ; preds = %do.end, %while.body33.preheader - %indvar = phi i64 [ %indvar.next, %do.end ], [ 0, %while.body33.preheader ] - %sum2.2385 = phi i64 [ %rem102, %do.end ], [ %and, %while.body33.preheader ] - %len.addr.1384 = phi i64 [ %sub34, %do.end ], [ %len, %while.body33.preheader ] - %buf.addr.1383 = phi i8* [ %scevgep390, %do.end ], [ %buf, %while.body33.preheader ] - %adler.addr.3382 = phi i64 [ %rem101, %do.end ], [ %and1, %while.body33.preheader ] - %0 = mul i64 %indvar, 5552 - %1 = add i64 %0, -13 - %scevgep2 = getelementptr i8, i8* %buf, i64 %1 - %sub34 = add i64 %len.addr.1384, -5552 - call void @llvm.ppc.mtctr.i64(i64 347) - br label %do.body - - do.body: ; preds = %do.body, %while.body33 - %adler.addr.4 = phi i64 [ %adler.addr.3382, %while.body33 ], [ %add49, %do.body ] - %sum2.3 = phi i64 [ %sum2.2385, %while.body33 ], [ %add98, %do.body ] - %tmp15.phi = phi i8* [ %scevgep2, %while.body33 ], [ %tmp15.inc, %do.body ] - %tmp15.inc = getelementptr i8, i8* %tmp15.phi, i64 16 - %add38 = add i64 %adler.addr.4, %sum2.3 - %add42 = add i64 %add38, %adler.addr.4 - %add46 = add i64 %add42, %adler.addr.4 - %tmp15 = load i8, i8* %tmp15.inc, align 1, !tbaa !1 - %conv48 = zext i8 %tmp15 to i64 - %add49 = add i64 %adler.addr.4, %conv48 - %add50 = add i64 %add46, %add49 - %add54 = add i64 %add50, %add49 - %add58 = add i64 %add54, %add49 - %add62 = add i64 %add58, %add49 - %add66 = add i64 %add62, %add49 - %add70 = add i64 %add66, %add49 - %add74 = add i64 %add70, %add49 - %add78 = add i64 %add74, %add49 - %add82 = add i64 %add78, %add49 - %add86 = add i64 %add82, %add49 - %add90 = add i64 %add86, 
%add49 - %add94 = add i64 %add90, %add49 - %add98 = add i64 %add94, %add49 - %2 = call i1 @llvm.ppc.is.decremented.ctr.nonzero() - br i1 %2, label %do.body, label %do.end - - do.end: ; preds = %do.body - %scevgep390 = getelementptr i8, i8* %buf.addr.1383, i64 5552 - %rem101 = urem i64 %add49, 65521 - %rem102 = urem i64 %add98, 65521 - %cmp31 = icmp ugt i64 %sub34, 5551 - %indvar.next = add i64 %indvar, 1 - br i1 %cmp31, label %while.body33, label %while.end103 - - while.end103: ; preds = %do.end - br i1 undef, label %if.end188, label %while.body109.preheader - - while.body109.preheader: ; preds = %while.end103, %while.cond30.preheader - %buf.addr.1.lcssa394400 = phi i8* [ %buf, %while.cond30.preheader ], [ %scevgep390, %while.end103 ] - %arrayidx151 = getelementptr inbounds i8, i8* %buf.addr.1.lcssa394400, i64 10 - %tmp45 = load i8, i8* %arrayidx151, align 1, !tbaa !1 - %conv152 = zext i8 %tmp45 to i64 - br label %while.body109 - - while.body109: ; preds = %while.body109, %while.body109.preheader - %adler.addr.5373 = phi i64 [ %add153, %while.body109 ], [ undef, %while.body109.preheader ] - %add153 = add i64 %adler.addr.5373, %conv152 - br label %while.body109 - - if.end188: ; preds = %while.end103 - %shl189 = shl nuw nsw i64 %rem102, 16 - %or190 = or i64 %shl189, %rem101 - br label %cleanup - - cleanup: ; preds = %if.end188, %while.cond.preheader, %if.then - %retval.0 = phi i64 [ %or, %if.then ], [ %or28, %while.cond.preheader ], [ %or190, %if.end188 ] - ret i64 %retval.0 - } - - ; Function Attrs: nounwind - declare void @llvm.ppc.mtctr.i64(i64) #1 - - ; Function Attrs: nounwind - declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #1 - - ; Function Attrs: nounwind - declare void @llvm.stackprotector(i8*, i8**) #1 - - attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #1 = { nounwind } - - !llvm.ident = !{!0} - - !0 = !{!"clang version 5.0.0 "} - !1 = !{!2, !2, i64 0} - !2 = !{!"omnipotent char", !3, i64 0} - !3 = !{!"Simple C/C++ TBAA"} - -... 
---- -name: adler32_z -alignment: 4 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '%x3' } - - { reg: '%x4' } - - { reg: '%x5' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -fixedStack: - - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%x30' } - - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%x29' } - - { id: 2, offset: -8, size: 8, alignment: 8, isImmutable: true, isAliased: false } -body: | - bb.0.entry: - successors: %bb.1.if.then(0x40000000), %bb.3.if.end15(0x40000000) - liveins: %x3, %x4, %x5, %x29, %x30 - - %x6 = RLWINM8 %x3, 16, 16, 31 - %x3 = RLDICL killed %x3, 0, 48 - BC undef %cr5lt, %bb.3.if.end15 - - bb.1.if.then: - successors: %bb.2.if.then(0x80000000) - liveins: %x3, %x6, %x29, %x30 - - %x4 = ADD8 %x3, killed %x6 - - bb.2.if.then: - liveins: %lr8, %rm, %x3, %x4 - - %x4 = RLDICR killed %x4, 16, 47 - %x3 = OR8 killed %x4, killed %x3 - BLR8 implicit %lr8, implicit %rm, implicit %x3 - - bb.3.if.end15: - successors: %bb.6.while.cond.preheader(0x40000000), %bb.4.while.cond30.preheader(0x40000000) - liveins: %x3, %x4, %x5, %x6, %x29, %x30 - - BC undef %cr5lt, %bb.6.while.cond.preheader - - bb.4.while.cond30.preheader: - successors: %bb.7.while.body33.preheader(0x40000000), %bb.5(0x40000000) - liveins: %x3, %x4, %x5, %x6, %x29, %x30 - - BCn undef %cr5lt, %bb.7.while.body33.preheader - - bb.5: - successors: %bb.12.while.body109.preheader(0x80000000) - liveins: %x4, %x29, %x30 - - %x7 = OR8 %x4, killed %x4 - B %bb.12.while.body109.preheader - - bb.6.while.cond.preheader: - successors: %bb.2.if.then(0x80000000) - liveins: %x3, %x6, %x29, %x30 - - %x4 = LIS8 15 - %x4 = ORI8 killed %x4, 225 - %x4 = RLDICR killed %x4, 32, 31 - %x4 = ORIS8 killed %x4, 3375 - %x4 = ORI8 killed %x4, 50637 - %x4 = MULHDU %x6, killed %x4 - %x5 = SUBF8 %x4, %x6 - %x5 = RLDICL killed %x5, 63, 1 - %x4 = ADD8 killed %x5, killed %x4 - %x5 = LI8 0 - %x4 = RLDICL killed %x4, 49, 15 - %x5 = ORI8 killed %x5, 65521 - %x4 = MULLD killed %x4, killed %x5 - %x4 = SUBF8 killed %x4, killed %x6 - B %bb.2.if.then - - bb.7.while.body33.preheader: - successors: %bb.8.while.body33(0x80000000) - liveins: %x3, %x4, %x5, %x6, %x29, %x30 - - STD killed %x29, -24, %x1 :: (store 8 into %fixed-stack.1) - STD killed %x30, -16, %x1 :: (store 8 into %fixed-stack.0, align 16) - %x7 = LIS8 15 - %x7 = ORI8 killed %x7, 225 - %x7 = RLDICR killed %x7, 32, 31 - %x8 = LI8 0 - %x7 = ORIS8 killed %x7, 3375 - %x9 = LI8 347 - %x10 = ORI8 killed %x7, 50637 - %x11 = ORI8 %x8, 65521 - %x7 = OR8 %x4, %x4 - - bb.8.while.body33: - successors: %bb.9.do.body(0x80000000) - liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11 - - %x12 = MULLI8 %x8, 5552 - %x12 = ADD8 %x4, killed %x12 - %x12 = ADDI8 killed %x12, -13 - %x5 = ADDI8 killed %x5, -5552 - MTCTR8loop %x9, implicit-def dead %ctr8 - - bb.9.do.body: - successors: %bb.9.do.body(0x7c000000), %bb.10.do.end(0x04000000) - liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11, %x12 - - %x0, %x12 = LBZU8 16, killed %x12 :: (load 1 from %ir.tmp15.inc, !tbaa !1) - %x6 = ADD8 %x3, killed %x6 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x3 = ADD8 killed %x3, killed 
%x0 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - %x6 = ADD8 killed %x6, %x3 - BDNZ8 %bb.9.do.body, implicit-def %ctr8, implicit %ctr8 - - bb.10.do.end: - successors: %bb.8.while.body33(0x7c000000), %bb.11.while.end103(0x04000000) - liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11 - - %x12 = MULHDU %x3, %x10 - %x0 = MULHDU %x6, %x10 - %x30 = SUBF8 %x12, %x3 - %x29 = SUBF8 %x0, %x6 - %x30 = RLDICL killed %x30, 63, 1 - %x29 = RLDICL killed %x29, 63, 1 - %x12 = ADD8 killed %x30, killed %x12 - %x0 = ADD8 killed %x29, killed %x0 - %cr0 = CMPLDI %x5, 5551 - %x12 = RLDICL killed %x12, 49, 15 - %x0 = RLDICL killed %x0, 49, 15 - %x12 = MULLD killed %x12, %x11 - %x0 = MULLD killed %x0, %x11 - %x7 = ADDI8 killed %x7, 5552 - %x3 = SUBF8 killed %x12, killed %x3 - %x6 = SUBF8 killed %x0, killed %x6 - %x8 = ADDI8 killed %x8, 1 - BCC 44, killed %cr0, %bb.8.while.body33 - - bb.11.while.end103: - successors: %bb.14.if.end188(0x40000000), %bb.12.while.body109.preheader(0x40000000) - liveins: %x3, %x6, %x7 - - %x30 = LD -16, %x1 :: (load 8 from %fixed-stack.0, align 16) - %x29 = LD -24, %x1 :: (load 8 from %fixed-stack.1) - BC undef %cr5lt, %bb.14.if.end188 - - bb.12.while.body109.preheader: - successors: %bb.13.while.body109(0x80000000) - liveins: %x7, %x29, %x30 - - %x3 = LBZ8 10, killed %x7 :: (load 1 from %ir.arrayidx151, !tbaa !1) - %x4 = IMPLICIT_DEF - - bb.13.while.body109: - successors: %bb.13.while.body109(0x80000000) - liveins: %x3, %x4, %x29, %x30 - - %x4 = ADD8 killed %x4, %x3 - B %bb.13.while.body109 - - bb.14.if.end188: - liveins: %x3, %x6, %x29, %x30 - - %x4 = RLDICR killed %x6, 16, 47 - %x3 = OR8 killed %x4, killed %x3 - BLR8 implicit %lr8, implicit %rm, implicit %x3 - -... 
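The four new testCompares*.ll files below all verify the same PowerPC lowering: an i64 equality comparison feeding a zero or sign extension is emitted as a branch-free, compare-free sequence (xor + cntlzd + rldicl for the zext forms, xor + addic + subfe for the sext forms), and the --implicit-check-not cmpw/cmpd/cmpl flags on the RUN lines enforce that no compare instruction survives anywhere in the output. The ModuleID comments name C sources that are not included in this commit; as a rough, hypothetical reconstruction, ComparisonTestCases/testComparesieqsll.c presumably contains something along these lines (function bodies are assumptions inferred from the IR, not the original file):

    /* Hypothetical sketch of ComparisonTestCases/testComparesieqsll.c,
       inferred from the IR in the test; the original source is not part
       of this diff. */
    long long glob;

    /* zext form: 0/1 result, expected to lower to xor + cntlzd + rldicl */
    int test_ieqsll(long long a, long long b) { return a == b; }

    /* sext form: 0/-1 result, expected to lower to xor + addic + subfe */
    int test_ieqsll_sext(long long a, long long b) { return -(a == b); }

    /* comparing against zero needs no xor */
    int test_ieqsll_z(long long a) { return a == 0; }

    /* store forms write the widened i1 to the global instead of returning it */
    void test_ieqsll_store(long long a, long long b) { glob = (a == b); }

The other three files vary only the signedness of the operands (ieqsll vs. iequll) and the width of the result (the ll* variants return i64 instead of i32), which is why their CHECK sequences are identical.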
diff --git a/test/CodeGen/PowerPC/testComparesieqsll.ll b/test/CodeGen/PowerPC/testComparesieqsll.ll new file mode 100644 index 0000000000000..57c7365eff03a --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqsll.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqsll.c' + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll_z(i64 %a) { +; CHECK-LABEL: test_ieqsll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsll_sext_z(i64 %a) { +; CHECK-LABEL: test_ieqsll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_ieqsll_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_z_store(i64 %a) { +; CHECK-LABEL: test_ieqsll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; 
CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_ieqsll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequll.ll b/test/CodeGen/PowerPC/testComparesiequll.ll new file mode 100644 index 0000000000000..c289290718455 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequll.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequll.c' + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll_z(i64 %a) { +; CHECK-LABEL: test_iequll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequll_sext_z(i64 %a) { +; CHECK-LABEL: test_iequll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_iequll_sext_store: +; CHECK: # BB#0: # %entry +; 
CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_z_store(i64 %a) { +; CHECK-LABEL: test_iequll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_iequll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqsll.ll b/test/CodeGen/PowerPC/testCompareslleqsll.ll new file mode 100644 index 0000000000000..4797ddfbfe970 --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqsll.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll(i64 %a, i64 %b) { +; CHECK-LABEL: test_lleqsll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_lleqsll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll_z(i64 %a) { +; CHECK-LABEL: test_lleqsll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsll_sext_z(i64 %a) { +; CHECK-LABEL: test_lleqsll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_store(i64 %a, i64 %b) { +; 
CHECK-LABEL: test_lleqsll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_lleqsll_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_z_store(i64 %a) { +; CHECK-LABEL: test_lleqsll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_lleqsll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequll.ll b/test/CodeGen/PowerPC/testComparesllequll.ll new file mode 100644 index 0000000000000..4dc7be69d2c8d --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequll.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i64 0, align 8 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll_sext(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll_z(i64 %a) { +; CHECK-LABEL: test_llequll_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: 
cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequll_sext_z(i64 %a) { +; CHECK-LABEL: test_llequll_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_sext_store(i64 %a, i64 %b) { +; CHECK-LABEL: test_llequll_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, %b + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_z_store(i64 %a) { +; CHECK-LABEL: test_llequll_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzd r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicl r3, r3, 58, 63 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = zext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequll_sext_z_store(i64 %a) { +; CHECK-LABEL: test_llequll_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: addic r3, r3, -1 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: subfe r3, r3, r3 +; CHECK-NEXT: std r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i64 %a, 0 + %conv1 = sext i1 %cmp to i64 + store i64 %conv1, i64* @glob, align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/vec_xxpermdi.ll b/test/CodeGen/PowerPC/vec_xxpermdi.ll new file mode 100644 index 0000000000000..9be2a1864a04e --- /dev/null +++ b/test/CodeGen/PowerPC/vec_xxpermdi.ll @@ -0,0 +1,307 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-LE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-BE + +; Possible LE ShuffleVector masks (Case 1): +; ShuffleVector((vector double)a, (vector double)b, 3, 1) +; ShuffleVector((vector double)a, (vector double)b, 2, 1) +; ShuffleVector((vector double)a, (vector double)b, 3, 0) +; ShuffleVector((vector double)a, (vector double)b, 2, 0) +; which targets at: +; xxpermdi a, b, 0 +; xxpermdi a, b, 1 +; xxpermdi a, b, 2 +; xxpermdi a, b, 3 +; Possible LE Swap ShuffleVector masks (Case 2): +; ShuffleVector((vector double)a, (vector double)b, 1, 3) +; ShuffleVector((vector double)a, (vector double)b, 0, 3) +; ShuffleVector((vector double)a, (vector double)b, 1, 2) +; ShuffleVector((vector 
double)a, (vector double)b, 0, 2) +; which targets at: +; xxpermdi b, a, 0 +; xxpermdi b, a, 1 +; xxpermdi b, a, 2 +; xxpermdi b, a, 3 +; Possible LE ShuffleVector masks when a == b, b is undef (Case 3): +; ShuffleVector((vector double)a, (vector double)a, 1, 1) +; ShuffleVector((vector double)a, (vector double)a, 0, 1) +; ShuffleVector((vector double)a, (vector double)a, 1, 0) +; ShuffleVector((vector double)a, (vector double)a, 0, 0) +; which targets at: +; xxpermdi a, a, 0 +; xxpermdi a, a, 1 +; xxpermdi a, a, 2 +; xxpermdi a, a, 3 + +; Possible BE ShuffleVector masks (Case 4): +; ShuffleVector((vector double)a, (vector double)b, 0, 2) +; ShuffleVector((vector double)a, (vector double)b, 0, 3) +; ShuffleVector((vector double)a, (vector double)b, 1, 2) +; ShuffleVector((vector double)a, (vector double)b, 1, 3) +; which targets at: +; xxpermdi a, b, 0 +; xxpermdi a, b, 1 +; xxpermdi a, b, 2 +; xxpermdi a, b, 3 +; Possible BE Swap ShuffleVector masks (Case 5): +; ShuffleVector((vector double)a, (vector double)b, 2, 0) +; ShuffleVector((vector double)a, (vector double)b, 3, 0) +; ShuffleVector((vector double)a, (vector double)b, 2, 1) +; ShuffleVector((vector double)a, (vector double)b, 3, 1) +; which targets at: +; xxpermdi b, a, 0 +; xxpermdi b, a, 1 +; xxpermdi b, a, 2 +; xxpermdi b, a, 3 +; Possible BE ShuffleVector masks when a == b, b is undef (Case 6): +; ShuffleVector((vector double)a, (vector double)a, 0, 0) +; ShuffleVector((vector double)a, (vector double)a, 0, 1) +; ShuffleVector((vector double)a, (vector double)a, 1, 0) +; ShuffleVector((vector double)a, (vector double)a, 1, 1) +; which targets at: +; xxpermdi a, a, 0 +; xxpermdi a, a, 1 +; xxpermdi a, a, 2 +; xxpermdi a, a, 3 + +define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 1> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_0 +; CHECK-LE: xxmrghd 34, 34, 35 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 1> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_1 +; CHECK-LE: xxpermdi 34, 34, 35, 1 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 0> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_2 +; CHECK-LE: xxpermdi 34, 34, 35, 2 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 0> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_3 +; CHECK-LE: xxmrgld 34, 34, 35 +; CHECK-LE: blr +} + +define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 3> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_0 +; CHECK-LE: xxmrghd 34, 35, 34 +; CHECK-LE: blr +} + +define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 3> + ret <2 x double> %0 +; CHECK-LE-LABEL: 
@test_le_swap_vec_xxpermdi_v2f64_v2f64_1 +; CHECK-LE: xxpermdi 34, 35, 34, 1 +; CHECK-LE: blr +} + +define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 2> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_2 +; CHECK-LE: xxpermdi 34, 35, 34, 2 +; CHECK-LE: blr +} + +define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 2> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_3 +; CHECK-LE: xxmrgld 34, 35, 34 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_0 +; CHECK-LE: xxspltd 34, 34, 0 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_1 +; CHECK-LE: blr +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_2 +; CHECK-LE: xxswapd 34, 34 +} + +define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0> + ret <2 x double> %0 +; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_3 +; CHECK-LE: xxspltd 34, 34, 1 +; CHECK-LE: blr +} + +; Start testing BE +define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 2> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_0 +; CHECK-BE: xxmrghd 34, 34, 35 +; CHECK-BE: blr +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 3> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_1 +; CHECK-BE: xxpermdi 34, 34, 35, 1 +; CHECK-BE: blr +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 2> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_2 +; CHECK-BE: xxpermdi 34, 34, 35, 2 +; CHECK-BE: blr +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 3> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_3 +; CHECK-BE: xxmrgld 34, 34, 35 +; CHECK-BE: blr +} + +define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 0> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_0 +; CHECK-BE: xxmrghd 34, 35, 34 +; CHECK-BE: blr +} + +define <2 x double>
@test_be_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 1> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_1 +; CHECK-BE: xxpermdi 34, 35, 34, 1 +; CHECK-BE: blr +} + +define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 0> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_2 +; CHECK-BE: xxpermdi 34, 35, 34, 2 +; CHECK-BE: blr +} + +define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 1> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_3 +; CHECK-BE: xxmrgld 34, 35, 34 +; CHECK-BE: blr +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_0 +; CHECK-BE: xxspltd 34, 34, 0 +; CHECK-BE: blr +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_1 +; CHECK-BE: blr +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_2 +; CHECK-BE: xxswapd 34, 34 +} + +define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) { + entry: + %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1> + ret <2 x double> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_3 +; CHECK-BE: xxspltd 34, 34, 1 +; CHECK-BE: blr +} + +; More test cases to test different types of vector inputs +define <16 x i8> @test_be_vec_xxpermdi_v16i8_v16i8(<16 x i8> %VA, <16 x i8> %VB) { + entry: + %0 = shufflevector <16 x i8> %VA, <16 x i8> %VB,<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19> + ret <16 x i8> %0 +; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v16i8_v16i8 +; CHECK-BE: xxpermdi 34, 34, 35, 1 +; CHECK-BE: blr +} + +define <8 x i16> @test_le_swap_vec_xxpermdi_v8i16_v8i16(<8 x i16> %VA, <8 x i16> %VB) { + entry: + %0 = shufflevector <8 x i16> %VA, <8 x i16> %VB,<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> + ret <8 x i16> %0 +; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v8i16_v8i16 +; CHECK-LE: xxpermdi 34, 35, 34, 1 +; CHECK-LE: blr +} + +define <4 x i32> @test_le_swap_vec_xxpermdi_v4i32_v4i32(<4 x i32> %VA, <4 x i32> %VB) { + entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB,<4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v4i32_v4i32 +; CHECK-LE: xxpermdi 34, 35, 34, 1 +; CHECK-LE: blr +} diff --git a/test/CodeGen/Thumb2/tbb-removeadd.mir b/test/CodeGen/Thumb2/tbb-removeadd.mir index 89ed987205394..1060667913439 100644 --- a/test/CodeGen/Thumb2/tbb-removeadd.mir +++ b/test/CodeGen/Thumb2/tbb-removeadd.mir @@ -39,7 +39,6 @@ name: Func alignment: 1 exposesReturnsTwice: false -noVRegs:
true legalized: false regBankSelected: false selected: false diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll index 4ec703921e29f..24aa5b98d0bb8 100644 --- a/test/CodeGen/X86/2007-01-08-InstrSched.ll +++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll @@ -13,10 +13,10 @@ define float @foo(float %x) nounwind { ; CHECK: mulss ; CHECK: mulss -; CHECK: mulss -; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss ; CHECK: ret } diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll index bc394f6e156fb..6c60aed67a7ba 100644 --- a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll +++ b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll @@ -5,7 +5,6 @@ define void @test_void_return() { ; CHECK-LABEL: name: test_void_return ; CHECK: alignment: 4 ; CHECK-NEXT: exposesReturnsTwice: false -; CHECK-NEXT: noVRegs: false ; CHECK-NEXT: legalized: false ; CHECK-NEXT: regBankSelected: false ; CHECK-NEXT: selected: false diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll index b9f7fc68cf689..ad82b8cfb775e 100644 --- a/test/CodeGen/X86/add-of-carry.ll +++ b/test/CodeGen/X86/add-of-carry.ll @@ -9,9 +9,11 @@ define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp { ; CHECK-LABEL: test1: ; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: adcl $0, %eax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: adcl %ecx, %eax ; CHECK-NEXT: retl %add4 = add i32 %x, %sum %cmp = icmp ult i32 %add4, %x diff --git a/test/CodeGen/X86/addcarry.ll b/test/CodeGen/X86/addcarry.ll index 3f4ee362e230f..3c84af4aa9ec6 100644 --- a/test/CodeGen/X86/addcarry.ll +++ b/test/CodeGen/X86/addcarry.ll @@ -86,21 +86,14 @@ entry: define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) { ; CHECK-LABEL: pr31719: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: xorl %r10d, %r10d -; CHECK-NEXT: addq 8(%rsi), %rcx -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: addq 16(%rsi), %r8 -; CHECK-NEXT: setb %al -; CHECK-NEXT: addq 24(%rsi), %r9 ; CHECK-NEXT: addq (%rsi), %rdx -; CHECK-NEXT: adcq $0, %rcx -; CHECK-NEXT: adcq %r8, %r10 -; CHECK-NEXT: adcq %r9, %rax +; CHECK-NEXT: adcq 8(%rsi), %rcx +; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: adcq 24(%rsi), %r9 ; CHECK-NEXT: movq %rdx, (%rdi) ; CHECK-NEXT: movq %rcx, 8(%rdi) -; CHECK-NEXT: movq %r10, 16(%rdi) -; CHECK-NEXT: movq %rax, 24(%rdi) +; CHECK-NEXT: movq %r8, 16(%rdi) +; CHECK-NEXT: movq %r9, 24(%rdi) ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: retq entry: @@ -190,9 +183,9 @@ entry: define i64 @shiftadd(i64 %a, i64 %b, i64 %c, i64 %d) { ; CHECK-LABEL: shiftadd: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: leaq (%rdx,%rcx), %rax ; CHECK-NEXT: addq %rsi, %rdi -; CHECK-NEXT: adcq $0, %rax +; CHECK-NEXT: adcq %rcx, %rdx +; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: retq entry: %0 = zext i64 %a to i128 diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index 2aaf14001758f..aa28ef5175ed6 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -135,88 +135,87 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), 
%xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd 
{{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; 
SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -259,198 +258,183 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: -; SSE2-NEXT: subq $152, %rsp -; SSE2-NEXT: .Lcfi0: -; SSE2-NEXT: .cfi_def_cfa_offset 160 -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa 32(%rdi), %xmm5 -; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa (%rsi), %xmm5 +; SSE2-NEXT: movdqa 16(%rsi), %xmm13 +; SSE2-NEXT: movdqa 32(%rsi), %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm15, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = 
xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm10 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: paddd %xmm14, %xmm12 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: paddd %xmm15, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: paddd %xmm8, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm13 +; SSE2-NEXT: movdqa %xmm11, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm11, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: paddd %xmm2, 
%xmm14 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa 48(%rsi), %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd 
-{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: packuswb %xmm1, %xmm10 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm12, %xmm4 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: psrld $1, %xmm15 ; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm13 +; SSE2-NEXT: packuswb %xmm15, %xmm13 +; SSE2-NEXT: packuswb %xmm4, %xmm13 +; SSE2-NEXT: psrld $1, %xmm6 ; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm9, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, 
%xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm10, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: packuswb %xmm9, %xmm6 ; SSE2-NEXT: psrld $1, %xmm11 +; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm14 ; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: packuswb %xmm14, %xmm11 +; SSE2-NEXT: packuswb %xmm6, %xmm11 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm5, %xmm7 +; SSE2-NEXT: packuswb %xmm3, %xmm7 +; SSE2-NEXT: movdqu %xmm7, (%rax) +; SSE2-NEXT: movdqu %xmm11, (%rax) +; SSE2-NEXT: movdqu %xmm13, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: addq $152, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: @@ -464,21 +448,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 @@ -540,13 +524,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2 -; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 @@ -673,27 +657,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -755,80 +739,79 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), 
%xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -837,7 +820,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: @@ -847,13 +830,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -884,9 +867,9 @@ define 
void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 @@ -1047,88 +1030,87 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 
+; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; 
SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; 
SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1512,27 +1494,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -1594,80 +1576,79 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; 
SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, 
%xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -1676,7 +1657,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: @@ -1686,13 +1667,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -1723,9 +1704,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll index 341dd867e4ff4..647b7a8f4dfca 100644 --- a/test/CodeGen/X86/avx.ll +++ b/test/CodeGen/X86/avx.ll @@ -113,11 +113,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; CHECK-NOT: mov ; CHECK: insertps $48 ; CHECK: insertps $48 +; CHECK: vaddps ; CHECK: insertps $48 ; CHECK: insertps $48 ; CHECK: vaddps ; CHECK: vaddps -; CHECK: vaddps ; CHECK-NEXT: ret %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll index 63b0281a73399..e29cf09718ad9 100644 --- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll +++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll @@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3 ; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: korw %k3, %k2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2 +; CHECK-NEXT: korw %k2, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index 4890afec2164b..c03623a2f0359 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b ; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vmovaps %zmm1, %zmm3 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: movw $1, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm4 -; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} ; CHECK-NEXT: movw $220, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 -; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 -; CHECK-NEXT: 
vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 32da0a70218e3..431223611faea 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) @@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] -; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = 
zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) @@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -1616,9 +1616,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) @@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) @@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrld 
$4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) @@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> %x2, i16 %x3) @@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, < ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) @@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) @@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, < ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) @@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: 
vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 563cad04b8c2d..b04c1ab38e559 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -479,11 +479,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2usi %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4) @@ -498,11 +498,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %rcx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2si %xmm0, %rax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4) @@ -517,11 +517,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %rcx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2usi %xmm0, %rax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4) @@ -536,11 +536,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %rcx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2si %xmm0, %rax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4) @@ -555,11 +555,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32: ; CHECK: ## BB#0: -; 
CHECK-NEXT: vcvtsd2usi %xmm0, %ecx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2usi %xmm0, %eax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) @@ -574,11 +574,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %ecx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2si %xmm0, %eax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) @@ -593,11 +593,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %ecx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2usi %xmm0, %eax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) @@ -612,11 +612,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %ecx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2si %xmm0, %eax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: retq %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) @@ -685,8 +685,9 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1} ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi) -; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask) @@ -4398,8 +4399,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, < ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z} -; CHECK-NEXT: vprold $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vprold $3, %zmm0, %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> 
@llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
@@ -4418,8 +4419,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
@@ -4520,9 +4521,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
@@ -4543,9 +4544,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vmovapd %zmm0, %zmm5
; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4612,9 +4613,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
; CHECK-NEXT: vmovaps %zmm0, %zmm5
; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
+; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
index 4ef88ac495c32..96aefdb105845 100644
--- a/test/CodeGen/X86/avx512-mask-spills.ll
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -9,13 +9,11 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -34,14 +32,12 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
; CHECK-NEXT: Lcfi1:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -60,14 +56,12 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
; CHECK-NEXT: Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -85,14 +79,12 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
; CHECK-NEXT: Lcfi3:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
+; CHECK-NEXT: kord %k1, %k0, %k0
; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
-; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
-; CHECK-NEXT: kord %k1, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %ymm0
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
@@ -106,20 +98,18 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
; CHECK-LABEL: test_64i1:
; CHECK: ## BB#0:
-; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: pushq %rax
; CHECK-NEXT: Lcfi4:
-; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
-; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload
; CHECK-NEXT: vpmovm2b %k0, %zmm0
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
%cmp_res = icmp ugt <64 x i8> %a, %b
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 9b4e73a18fc28..faa055dfbbf3f 100644
--- 
a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512: @@ -806,9 +806,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1) @@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; @@ -836,8 +836,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1 +; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z} ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 51f9a382ccbfd..ca01033bf78ba 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512: @@ -2169,9 +2169,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, 
%zmm0 -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4) %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4) @@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} +; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512: @@ -2421,9 +2421,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z} +; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1 -; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0 +; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index 7df07b0413ed4..571f345d4616b 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] ; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) @@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] ; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] ; CHECK-NEXT: 
vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) @@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] -; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9] +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) @@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] -; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] ; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9] +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) @@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0] ; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] -; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] ; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9] +; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) @@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] -; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] ; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9] +; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1] ; CHECK-NEXT: retq ## 
encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) @@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1] +; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1] -; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb] -; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] +; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) @@ -1496,9 +1496,9 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16> ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9] ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1] +; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3] ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1] -; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb] -; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1) @@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03] -; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca] +; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) @@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1, ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03] ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsrlw $3, %ymm0, 
%ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03] -; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca] +; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03] ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll index 8f528394f5bd5..f8f47c87100ad 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll index 37aea45e6107d..96254f7c95b0f 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll @@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll index cf79819734a2d..636358fb91cbd 100644 --- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -39,8 +39,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll index 06ee237593e79..d54208c00987c 100644 --- a/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -404,8 +404,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0, ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; 
CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3) @@ -424,8 +424,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 52a84deebf519..595b3e0ebb863 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -1568,8 +1568,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) @@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, < ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3] ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb] -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) @@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression 
encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3] ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll index ad9ea93c20311..1bfdfd0e634de 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, ; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1] ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0] ; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1] -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3) @@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8] ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0] -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll index 30ecc0d2e49e5..9659dc6d455af 100644 --- a/test/CodeGen/X86/avx512ifma-intrinsics.ll +++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll @@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} -; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; 
CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} -; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll index 3ca686cef3bf4..b2fe6eba88aba 100644 --- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll +++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll @@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, 
<4 x ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 4d906a4fd29a2..c2d8df6476b3e 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 
x i32> %x1, i8 -1) @@ -50,8 +50,8 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) @@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) @@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 
x float> %a0, <8 x float> zeroinitializer, i8 -1) @@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) @@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> 
@llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
@@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
@@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
@@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
-; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
@@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01]
; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3]
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0]
-; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb]
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
+; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1)
@@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03]
-; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03]
-; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32>
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02]
; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1]
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1)
@@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
+; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
@@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 1f324d6795649..684b0468cf518 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
@@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
@@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
@@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
@@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc]
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
@@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <
; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
+; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
@@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0,
; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8]
; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
+; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
-; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index a681c3b0aa429..092b139fca2f9 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -6,68 +6,35 @@
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=AVX512

define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
-; SSE2-SSSE3-LABEL: v8i16:
-; SSE2-SSSE3: ## BB#0:
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pextrw $7, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $5, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $3, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $1, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: v8i16:
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8i16:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i16:
; AVX12: ## BB#0:
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrw $7, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $6, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $5, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $4, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i16:
@@ -90,22 +57,8 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i32:
@@ -113,19 +66,8 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i32:
@@ -149,22 +91,8 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltps %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andps %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: movd %xmm3, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskps %xmm3, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f32:
@@ -172,19 +100,8 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltps %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f32:
@@ -208,56 +125,8 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-SSSE3-NEXT: andb $1, %cl
-; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v16i8:
@@ -265,55 +134,8 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrb $15, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $14, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $13, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $12, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $11, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $10, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $9, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $8, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $7, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $6, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $5, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $4, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $3, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $2, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $1, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrb $0, %xmm0, %eax
-; AVX12-NEXT: andb $1, %al
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v16i8:
@@ -383,14 +205,8 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i8:
@@ -405,26 +221,21 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskpd %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i8:
@@ -439,26 +250,21 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX2-NEXT: vpsrad $24, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskpd %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i8:
@@ -537,14 +343,8 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i16:
@@ -559,26 +359,21 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskpd %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i16:
@@ -593,26 +388,21 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX2-NEXT: vpsrad $16, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskpd %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i16:
@@ -683,14 +473,8 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i32:
@@ -703,24 +487,19 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskpd %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
@@ -733,24 +512,19 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vmovmskpd %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: retq
;
; AVX512-LABEL: v2i32:
@@ -801,14 +575,8 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2i64:
@@ -816,13 +584,8 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrq $1, %xmm0, %rax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovq %xmm0, %rax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2i64:
@@ -846,14 +609,8 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double>
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltpd %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andpd %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: movq %xmm3, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-SSSE3-NEXT: movq %xmm0, %rax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2f64:
@@ -861,13 +618,8 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double>
; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltpd %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandpd %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrq $1, %xmm0, %rax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovq %xmm0, %rax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vmovmskpd %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v2f64:
@@ -892,29 +644,15 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; SSE2-SSSE3-NEXT: psrad $24, %xmm3
; SSE2-SSSE3-NEXT: pslld $24, %xmm2
; SSE2-SSSE3-NEXT: psrad $24, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pslld $24, %xmm1
; SSE2-SSSE3-NEXT: psrad $24, %xmm1
; SSE2-SSSE3-NEXT: pslld $24, %xmm0
; SSE2-SSSE3-NEXT: psrad $24, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
@@ -923,26 +661,15 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3
; AVX12-NEXT: vpslld $24, %xmm2, %xmm2
; AVX12-NEXT: vpsrad $24, %xmm2, %xmm2
+; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
; AVX12-NEXT: vpslld $24, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX12-NEXT: vpslld $24, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
-; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i8:
@@ -975,29 +702,15 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; SSE2-SSSE3-NEXT: psrad $16, %xmm3
; SSE2-SSSE3-NEXT: pslld $16, %xmm2
; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pslld $16, %xmm1
; SSE2-SSSE3-NEXT: psrad $16, %xmm1
; SSE2-SSSE3-NEXT: pslld $16, %xmm0
; SSE2-SSSE3-NEXT: psrad $16, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-SSSE3-NEXT: movd %xmm0, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
@@ -1006,26 +719,15 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3
; AVX12-NEXT: vpslld $16, %xmm2, %xmm2
; AVX12-NEXT: vpsrad $16, %xmm2, %xmm2
+; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
; AVX12-NEXT: vpslld $16, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX12-NEXT: vpslld $16, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
-; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrd $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrd $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v4i16:
@@ -1052,45 +754,42 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
}

define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
-; SSE2-SSSE3-LABEL: v8i8:
-; SSE2-SSSE3: ## BB#0:
-; SSE2-SSSE3-NEXT: psllw $8, %xmm3
-; SSE2-SSSE3-NEXT: psraw $8, %xmm3
-; SSE2-SSSE3-NEXT: psllw $8, %xmm2
-; SSE2-SSSE3-NEXT: psraw $8, %xmm2
-; SSE2-SSSE3-NEXT: psllw $8, %xmm1
-; SSE2-SSSE3-NEXT: psraw $8, %xmm1
-; SSE2-SSSE3-NEXT: psllw $8, %xmm0
-; SSE2-SSSE3-NEXT: psraw $8, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pextrw $7, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $6, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $5, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $4, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $3, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $2, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: pextrw $1, %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movd %xmm2, %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: v8i8:
+; SSE2: ## BB#0:
+; SSE2-NEXT: psllw $8, %xmm3
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: psllw $8, %xmm2
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v8i8:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: psllw $8, %xmm3
+; SSSE3-NEXT: psraw $8, %xmm3
+; SSSE3-NEXT: psllw $8, %xmm2
+; SSSE3-NEXT: psraw $8, %xmm2
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: psllw $8, %xmm1
+; SSSE3-NEXT: psraw $8, %xmm1
+; SSSE3-NEXT: psllw $8, %xmm0
+; SSSE3-NEXT: psraw $8, %xmm0
+; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i8:
; AVX12: ## BB#0:
@@ -1098,38 +797,16 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3
; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX12-NEXT: vpsraw $8, %xmm2, %xmm2
+; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
-; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpextrw $7, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $6, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $5, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $4, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $3, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $2, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vpextrw $1, %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: vmovd %xmm0, %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX12-NEXT: retq
;
; AVX512-LABEL: v8i8:
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index 06b1a76f6baed..a6d6ca155302e 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -1,8 +1,83 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSE2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+SSSE3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512

define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
+; SSE2-SSSE3-LABEL: v4i64:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm9
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm10, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm2
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm0
+; SSE2-SSSE3-NEXT: psrad $31, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm7
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm6
+; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v4i64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v4i64:
; AVX2: ## BB#0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
@@ -12,19 +87,8 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT: vmovmskps %xmm0, %eax
+; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -45,30 +109,36 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
}

define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
-; AVX2-LABEL: v4f64:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; SSE2-SSSE3-LABEL: v4f64:
+; SSE2-SSSE3: ## BB#0:
+; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm2
+; SSE2-SSSE3-NEXT: psrad $31, %xmm2
+; SSE2-SSSE3-NEXT: cmpltpd %xmm5, %xmm7
+; SSE2-SSSE3-NEXT: cmpltpd %xmm4, %xmm6
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; SSE2-SSSE3-NEXT: pslld $31, %xmm6
+; SSE2-SSSE3-NEXT: psrad $31, %xmm6
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6
+; SSE2-SSSE3-NEXT: movmskps %xmm6, %eax
+; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: retq
+;
+; AVX12-LABEL: v4f64:
+; AVX12: ## BB#0:
+; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vmovmskps %xmm0, %eax
+; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: vzeroupper
+; AVX12-NEXT: retq
;
; AVX512-LABEL: v4f64:
; AVX512: ## BB#0:
@@ -87,6 +157,78 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
}

define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
+; SSE2-LABEL: v16i16:
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psllw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pmovmskb %xmm2, %eax
+; SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: v16i16:
+; SSSE3: ## BB#0:
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtw %xmm2, %xmm0
+; SSSE3-NEXT: pshufb %xmm3, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: psllw $7, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpgtb %xmm0, %xmm1
+; SSSE3-NEXT: pcmpgtw %xmm7, %xmm5
+; SSSE3-NEXT: pshufb %xmm3, %xmm5
+; SSSE3-NEXT: pcmpgtw %xmm6, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSSE3-NEXT: psllw $7, %xmm4
+; SSSE3-NEXT: pand %xmm8, %xmm4
+; SSSE3-NEXT: pcmpgtb %xmm4, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: v16i16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: v16i16:
; AVX2: ## BB#0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
@@ -96,55 +238,8 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $13, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $12, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $11, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $10, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $9, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $8, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $7, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $6, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $5, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $4, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $2, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: andb $1, %al
-; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -164,6 +259,79 @@ define i16
@v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { } define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { +; SSE2-LABEL: v8i32: +; SSE2: ## BB#0: +; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: psraw $15, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8i32: +; SSSE3: ## BB#0: +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm3, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: psllw $15, %xmm0 +; SSSE3-NEXT: psraw $15, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSSE3-NEXT: pshufb %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSSE3-NEXT: psllw $15, %xmm4 +; SSSE3-NEXT: psraw $15, %xmm4 +; SSSE3-NEXT: pand %xmm0, %xmm4 +; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm4, %eax +; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX1-LABEL: v8i32: +; AVX1: ## BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: v8i32: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 @@ -173,31 +341,9 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; 
AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -217,42 +363,74 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { } define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { -; AVX2-LABEL: v8f32: -; AVX2: ## BB#0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vcmpltps %ymm2, %ymm3, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; SSE2-LABEL: v8f32: +; SSE2: ## BB#0: +; SSE2-NEXT: cmpltps %xmm1, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: cmpltps %xmm0, %xmm2 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: psraw $15, %xmm0 +; SSE2-NEXT: cmpltps %xmm5, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: cmpltps %xmm4, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: psllw $15, %xmm2 +; SSE2-NEXT: psraw $15, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: 
pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v8f32: +; SSSE3: ## BB#0: +; SSSE3-NEXT: cmpltps %xmm1, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm1, %xmm3 +; SSSE3-NEXT: cmpltps %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: psllw $15, %xmm2 +; SSSE3-NEXT: psraw $15, %xmm2 +; SSSE3-NEXT: cmpltps %xmm5, %xmm7 +; SSSE3-NEXT: pshufb %xmm1, %xmm7 +; SSSE3-NEXT: cmpltps %xmm4, %xmm6 +; SSSE3-NEXT: pshufb %xmm1, %xmm6 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSSE3-NEXT: psllw $15, %xmm6 +; SSSE3-NEXT: psraw $15, %xmm6 +; SSSE3-NEXT: pand %xmm2, %xmm6 +; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm6, %eax +; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v8f32: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vcmpltps %ymm2, %ymm3, %ymm1 +; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: vzeroupper +; AVX12-NEXT: retq ; ; AVX512-LABEL: v8f32: ; AVX512: ## BB#0: @@ -270,121 +448,250 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) } define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { +; SSE2-SSSE3-LABEL: v32i8: +; SSE2-SSSE3: ## BB#0: +; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtb %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-SSSE3-NEXT: andb $1, %cl +; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-SSSE3-NEXT: andb $1, %cl +; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: andb $1, %al +; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; SSE2-SSSE3-NEXT: shll $16, %ecx +; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: orl %ecx, %eax +; SSE2-SSSE3-NEXT: retq +; +; AVX1-LABEL: v32i8: +; AVX1: ## BB#0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: Lcfi0: +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: Lcfi1: +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: Lcfi2: +; AVX1-NEXT: .cfi_def_cfa_register %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, 
%ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm1, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: movb %al, (%rsp) +; AVX1-NEXT: movl (%rsp), %eax +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: 
v32i8: ; AVX2: ## BB#0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: Lcfi0: -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: Lcfi1: -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: Lcfi2: -; AVX2-NEXT: .cfi_def_cfa_register %rbp -; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $32, %rsp ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb 
%al, (%rsp) -; AVX2-NEXT: movl (%rsp), %eax -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll index d1508f99fc71e..9bf7b41a4f26a 100644 --- a/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-setcc-128.ll @@ -1,69 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSE2 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=CHECK,SSE2-SSSE3,SSSE3 -; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX1 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX1 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=CHECK,AVX12,AVX2 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=CHECK,AVX512 define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) { -; SSE2-SSSE3-LABEL: v8i16: -; SSE2-SSSE3: ## BB#0: -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v8i16: +; SSE2: ## BB#0: +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq ; -; AVX1-LABEL: v8i16: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; 
AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; SSSE3-LABEL: v8i16: +; SSSE3: ## BB#0: +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v8i16: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i16: ; AVX512: ## BB#0: @@ -80,41 +46,16 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) { ; SSE2-SSSE3-LABEL: v4i32: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4i32: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4i32: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i32: ; AVX512: ## BB#0: @@ -132,42 +73,16 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) { ; SSE2-SSSE3-LABEL: v4f32: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movaps %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; 
SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4f32: -; AVX1: ## BB#0: -; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vextractps $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractps $0, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4f32: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4f32: ; AVX512: ## BB#0: @@ -185,111 +100,16 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) { ; SSE2-SSSE3-LABEL: v16i8: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl -; SSE2-SSSE3-NEXT: andb $1, %cl -; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: andb $1, %al -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v16i8: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $15, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $14, %xmm0, %eax -; AVX1-NEXT: 
andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $13, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $12, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $11, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $10, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $9, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $7, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $6, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $5, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $4, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $3, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $2, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $1, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; AVX1-NEXT: retq +; AVX12-LABEL: v16i8: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v16i8: ; AVX512: ## BB#0: @@ -330,14 +150,8 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i8: @@ -353,15 +167,27 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; +; AVX2-LABEL: v2i8: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX2-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: retq +; ; AVX512-LABEL: v2i8: ; AVX512: ## BB#0: ; AVX512-NEXT: vpsllq $56, %xmm1, %xmm1 @@ -406,14 +232,8 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i16: @@ -429,15 +249,27 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; +; AVX2-LABEL: v2i16: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: retq +; ; AVX512-LABEL: v2i16: ; AVX512: ## BB#0: ; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1 @@ -478,14 +310,8 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v2i32: @@ -499,15 +325,25 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) { ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb 
-{{[0-9]+}}(%rsp), %al +; AVX1-NEXT: vmovmskpd %xmm0, %eax +; AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX1-NEXT: retq ; +; AVX2-LABEL: v2i32: +; AVX2: ## BB#0: +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovmskpd %xmm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX2-NEXT: retq +; ; AVX512-LABEL: v2i32: ; AVX512: ## BB#0: ; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1 @@ -538,27 +374,16 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) { ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2i64: -; AVX1: ## BB#0: -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v2i64: +; AVX12: ## BB#0: +; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v2i64: ; AVX512: ## BB#0: @@ -576,27 +401,16 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) { ; SSE2-SSSE3-LABEL: v2f64: ; SSE2-SSSE3: ## BB#0: ; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-SSSE3-NEXT: movq %xmm0, %rax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v2f64: -; AVX1: ## BB#0: -; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v2f64: +; AVX12: ## BB#0: +; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 +; AVX12-NEXT: vmovmskpd %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v2f64: ; AVX512: ## BB#0: @@ -618,45 +432,20 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { ; SSE2-SSSE3-NEXT: pslld $24, %xmm0 ; SSE2-SSSE3-NEXT: psrad $24, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; 
SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4i8: -; AVX1: ## BB#0: -; AVX1-NEXT: vpslld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4i8: +; AVX12: ## BB#0: +; AVX12-NEXT: vpslld $24, %xmm1, %xmm1 +; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1 +; AVX12-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i8: ; AVX512: ## BB#0: @@ -682,45 +471,20 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-SSSE3-NEXT: movd %xmm1, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; SSE2-SSSE3-NEXT: retq ; -; AVX1-LABEL: v4i16: -; AVX1: ## BB#0: -; AVX1-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: 
movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; AVX12-LABEL: v4i16: +; AVX12: ## BB#0: +; AVX12-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX12-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vmovmskps %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v4i16: ; AVX512: ## BB#0: @@ -739,73 +503,42 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) { } define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { -; SSE2-SSSE3-LABEL: v8i8: -; SSE2-SSSE3: ## BB#0: -; SSE2-SSSE3-NEXT: psllw $8, %xmm1 -; SSE2-SSSE3-NEXT: psraw $8, %xmm1 -; SSE2-SSSE3-NEXT: psllw $8, %xmm0 -; SSE2-SSSE3-NEXT: psraw $8, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movd %xmm0, %eax -; SSE2-SSSE3-NEXT: andl $1, %eax -; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v8i8: +; SSE2: ## BB#0: +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSE2-NEXT: retq ; -; AVX1-LABEL: v8i8: -; AVX1: ## BB#0: -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movb -{{[0-9]+}}(%rsp), %al -; AVX1-NEXT: retq +; SSSE3-LABEL: v8i8: +; SSSE3: ## BB#0: +; SSSE3-NEXT: psllw $8, %xmm1 +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: psllw $8, %xmm0 +; SSSE3-NEXT: psraw 
$8, %xmm0 +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSSE3-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; SSSE3-NEXT: retq +; +; AVX12-LABEL: v8i8: +; AVX12: ## BB#0: +; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1 +; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> +; AVX12-NEXT: retq ; ; AVX512-LABEL: v8i8: ; AVX512: ## BB#0: diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll index 51c6ad7c7f9ef..b2c619c48d4d3 100644 --- a/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/test/CodeGen/X86/bitcast-setcc-256.ll @@ -8,55 +8,8 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -76,33 +29,8 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) { ; AVX2-LABEL: v8i32: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; 
AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -122,33 +50,8 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) { ; AVX2-LABEL: v8f32: ; AVX2: ## BB#0: ; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -167,117 +70,8 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) { define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX2-LABEL: v32i8: ; AVX2: ## BB#0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: Lcfi0: -; AVX2-NEXT: .cfi_def_cfa_offset 16 -; AVX2-NEXT: Lcfi1: -; AVX2-NEXT: .cfi_offset %rbp, -16 -; AVX2-NEXT: movq %rsp, %rbp -; AVX2-NEXT: Lcfi2: -; AVX2-NEXT: .cfi_def_cfa_register %rbp -; AVX2-NEXT: andq $-32, %rsp -; AVX2-NEXT: subq $32, %rsp ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm1, 
%eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al -; AVX2-NEXT: movb %al, (%rsp) -; AVX2-NEXT: movl (%rsp), %eax -; AVX2-NEXT: movq %rbp, %rsp -; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vpmovmskb %ymm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -296,21 +90,8 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) { ; AVX2-LABEL: v4i64: ; AVX2: ## BB#0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskpd %ymm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -331,21 +112,8 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b) { ; AVX2-LABEL: v4f64: ; AVX2: ## BB#0: ; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrd $3, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vpextrd $2, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; 
AVX2-NEXT: vpextrd $1, %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; AVX2-NEXT: vmovmskpd %ymm0, %eax +; AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill> ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll index a9c74df9d0d91..1340b7662a7ad 100644 --- a/test/CodeGen/X86/bswap_tree2.ll +++ b/test/CodeGen/X86/bswap_tree2.ll @@ -9,31 +9,32 @@ define i32 @test1(i32 %x) nounwind { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000 -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: bswapl %ecx -; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrl $16, %eax ; CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # BB#0: -; CHECK64-NEXT: movl %edi, %ecx -; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: shrl $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx ; CHECK64-NEXT: bswapl %edi ; CHECK64-NEXT: shrl $16, %edi -; CHECK64-NEXT: orl %ecx, %eax -; CHECK64-NEXT: orl %edi, %eax +; CHECK64-NEXT: orl %ecx, %edi +; CHECK64-NEXT: movl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff %byte1 = and i32 %x, 65280 ; 0x0000ff00 diff --git a/test/CodeGen/X86/eh-unknown.ll b/test/CodeGen/X86/eh-unknown.ll new file mode 100644 index 0000000000000..7c495bdadc676 --- /dev/null +++ b/test/CodeGen/X86/eh-unknown.ll @@ -0,0 +1,32 @@ +; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s + +; An unknown personality forces us to emit an Itanium LSDA. Make sure that the +; Itanium call site table actually tells the personality to keep unwinding, +; i.e. we have an entry and it says "has no landing pad". + +declare void @throwit() +declare void @__unknown_ehpersonality(...) 
+ +define void @use_unknown_ehpersonality() + personality void (...)* @__unknown_ehpersonality { +entry: + call void @throwit() + unreachable +} + +; CHECK-LABEL: use_unknown_ehpersonality: +; CHECK: .Lfunc_begin0: +; CHECK: .seh_handler __unknown_ehpersonality, @unwind, @except +; CHECK: callq throwit +; CHECK: .Lfunc_end0: +; CHECK: .seh_handlerdata +; CHECK: .Lexception0: +; CHECK: .byte 255 # @LPStart Encoding = omit +; CHECK: .byte 0 # @TType Encoding = absptr +; CHECK: .asciz "\217\200" # @TType base offset +; CHECK: .byte 3 # Call site Encoding = udata4 +; CHECK: .byte 13 # Call site table length +; CHECK: .long .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 << +; CHECK: .long .Lfunc_end0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Lfunc_end0 +; CHECK: .long 0 # has no landing pad +; CHECK: .byte 0 # On action: cleanup diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll index bd8888966cf2c..338a95f6a80cd 100644 --- a/test/CodeGen/X86/fmsubadd-combine.ll +++ b/test/CodeGen/X86/fmsubadd-combine.ll @@ -117,9 +117,9 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou ; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2 ; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3 ; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] -; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_pd512: @@ -137,9 +137,9 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou ; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2 ; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3 ; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] -; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; FMA4-NEXT: retq entry: %AB = fmul <8 x double> %A, %B @@ -157,9 +157,9 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl ; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2 ; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3 ; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1 +; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_ps512: @@ -178,9 +178,9 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl ; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2 ; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3 ; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1 +; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; FMA4-NEXT: retq entry: %AB = fmul <16 x float> %A, %B diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll index d68236e9d250e..eb06eb75a4d70 100644 
--- a/test/CodeGen/X86/fold-tied-op.ll +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -6,9 +6,10 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386--netbsd" ; CHECK-LABEL: fn1 -; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: imull {{.*#+}} 4-byte Folded Reload -; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload ; CHECK: retl %struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll index 98082ec611d49..6c6bc8bdc1d13 100644 --- a/test/CodeGen/X86/fp128-i128.ll +++ b/test/CodeGen/X86/fp128-i128.ll @@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 { ; CHECK-NEXT: andq %rdi, %rcx ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 ; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: jmp foo # TAILCALL diff --git a/test/CodeGen/X86/gnu-seh-nolpads.ll b/test/CodeGen/X86/gnu-seh-nolpads.ll new file mode 100644 index 0000000000000..311f4d522b1df --- /dev/null +++ b/test/CodeGen/X86/gnu-seh-nolpads.ll @@ -0,0 +1,34 @@ +; RUN: llc -mtriple=x86_64-windows-gnu < %s | FileCheck %s + +declare void @throwit() +declare void @__gxx_personality_seh0(...) +declare void @__gcc_personality_seh0(...) + +define void @use_gxx_seh() + personality void (...)* @__gxx_personality_seh0 { +entry: + call void @throwit() + unreachable +} + +; CHECK-LABEL: use_gxx_seh: +; CHECK: .seh_proc use_gxx_seh +; CHECK-NOT: .seh_handler __gxx_personality_seh0 +; CHECK: callq throwit +; CHECK: .seh_handlerdata +; CHECK: .seh_endproc + +define void @use_gcc_seh() + personality void (...)* @__gcc_personality_seh0 { +entry: + call void @throwit() + unreachable +} + +; CHECK-LABEL: use_gcc_seh: +; CHECK: .seh_proc use_gcc_seh +; CHECK-NOT: .seh_handler __gcc_personality_seh0 +; CHECK: callq throwit +; CHECK: .seh_handlerdata +; CHECK: .seh_endproc + diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir index d0ba057fa009c..b05c4467d3098 100644 --- a/test/CodeGen/X86/implicit-null-checks.mir +++ b/test/CodeGen/X86/implicit-null-checks.mir @@ -379,7 +379,7 @@ liveins: - { reg: '%esi' } # CHECK: bb.0.entry: # CHECK: %eax = MOV32ri 2200000 -# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x) +# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %eax, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x) # CHECK-NEXT: JMP_1 %bb.1.not_null body: | @@ -544,7 +544,7 @@ liveins: - { reg: '%rsi' } # CHECK: bb.0.entry: # CHECK: %rbx = MOV64rr %rdx -# CHECK-NEXT: %rdi = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x) +# CHECK-NEXT: %rdi = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %rbx, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x) body: | bb.0.entry: @@ -656,7 +656,7 @@ body: | name: use_alternate_load_op # CHECK-LABEL: name: use_alternate_load_op # CHECK: bb.0.entry: -# CHECK: %rax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _ +# CHECK: %rax = 
FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _ # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -689,7 +689,7 @@ body: | name: imp_null_check_gep_load_with_use_dep # CHECK-LABEL: name: imp_null_check_gep_load_with_use_dep # CHECK: bb.0.entry: -# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x) +# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x) # CHECK-NEXT: JMP_1 %bb.1.not_null alignment: 4 tracksRegLiveness: true @@ -721,7 +721,7 @@ name: imp_null_check_load_with_base_sep # CHECK-LABEL: name: imp_null_check_load_with_base_sep # CHECK: bb.0.entry: # CHECK: %rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags -# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %esi, %rdi, 1, _, 0, _, implicit-def dead %eflags +# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %esi, %rdi, 1, _, 0, _, implicit-def %eflags # CHECK-NEXT: JMP_1 %bb.1.not_null alignment: 4 tracksRegLiveness: true @@ -752,7 +752,7 @@ body: | name: inc_store # CHECK-LABEL: name: inc_store # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %rsi +# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %rsi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -782,7 +782,7 @@ body: | name: inc_store_plus_offset # CHECK-LABEL: inc_store_plus_offset # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %rsi +# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %rsi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -813,7 +813,7 @@ name: inc_store_with_dep # CHECK-LABEL: inc_store_with_dep # CHECK: bb.0.entry: # CHECK: %esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags -# CHECK-NEXT: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi +# CHECK-NEXT: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -972,7 +972,7 @@ body: | name: inc_store_with_reused_base # CHECK-LABEL: inc_store_with_reused_base # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi +# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -1174,7 +1174,7 @@ body: | name: inc_store_with_load_and_store # CHECK-LABEL: inc_store_with_load_and_store # CHECK: bb.0.entry: -# CHECK: _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %esi, implicit-def dead %eflags +# CHECK: _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %esi, implicit-def %eflags # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null @@ -1205,7 +1205,7 @@ body: | name: inc_store_and_load_no_alias # CHECK-LABEL: inc_store_and_load_no_alias # CHECK: bb.0.entry: -# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr) +# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr) # CHECK-NEXT: JMP_1 %bb.1.not_null # CHECK: bb.1.not_null diff --git a/test/CodeGen/X86/lrshrink.ll b/test/CodeGen/X86/lrshrink.ll new file mode 100644 index 0000000000000..a9cf086dbd900 --- /dev/null +++ b/test/CodeGen/X86/lrshrink.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call +; to minimize live-range. + +define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) { +entry: + br i1 %a, label %then, label %else + +then: + br label %else + +else: + %0 = phi i64 [ 4, %entry ], [ 10, %then ] + %r = phi i64 [ %r1, %entry ], [ %r2, %then ] + %s = phi i64 [ %s1, %entry ], [ %s2, %then ] + %t = phi i64 [ %t1, %entry ], [ %t2, %then ] +; CHECK-LABEL: test: +; CHECK: add +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add + %1 = tail call i32 @_Z3foov() + %2 = zext i32 %1 to i64 + %3 = tail call i32 @_Z3foov() + %4 = zext i32 %3 to i64 + %5 = tail call i32 @_Z3foov() + %6 = zext i32 %5 to i64 + %7 = add nuw nsw i64 %0, %r + tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %8 = add nuw nsw i64 %2, %7 + %9 = add nuw nsw i64 %4, %8 + %10 = add nuw nsw i64 %6, %9 + %11 = add nuw nsw i64 %s, %t + tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %12 = add nuw nsw i64 %10, %11 + ret i64 %12 +} + +declare i32 @_Z3foov() +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!1, !2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug) +!1 = !{i32 2, !"Dwarf Version", i32 4} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DIFile(filename: "a.c", directory: "./") +!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0) +!5 = !DILocalVariable(name: "x", scope: !4) +!6 = !DILocation(line: 4, scope: !4) diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll index d332b2f3169f0..af86df5100165 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-8, %rax @@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3 ; SSE2-NEXT: pmullw %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: pmullw %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: 
paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-16, %rax diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll index e62a1d04dad67..94bbe75702cb8 100644 --- a/test/CodeGen/X86/misched-matrix.ll +++ b/test/CodeGen/X86/misched-matrix.ll @@ -17,9 +17,9 @@ ; ; TOPDOWN-LABEL: %for.body ; TOPDOWN: movl %{{.*}}, ( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 4( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 8( ; TOPDOWN: movl %{{.*}}, 12( ; TOPDOWN-LABEL: %for.end diff --git a/test/CodeGen/X86/mul-constant-i16.ll b/test/CodeGen/X86/mul-constant-i16.ll index 6d2465ddd3a87..e3e2737cf3e62 100644 --- a/test/CodeGen/X86/mul-constant-i16.ll +++ b/test/CodeGen/X86/mul-constant-i16.ll @@ -188,16 +188,13 @@ define i16 @test_mul_by_11(i16 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: imull $11, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_11: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,2), %eax +; X64-NEXT: imull $11, %edi, %eax ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 11 @@ -228,16 +225,13 @@ define i16 @test_mul_by_13(i16 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $13, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_13: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: imull $13, %edi, %eax ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 13 @@ -247,19 +241,14 @@ define i16 @test_mul_by_13(i16 %x) { define i16 @test_mul_by_14(i16 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $14, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_14: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: 
leal (%rdi,%rax,4), %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: imull $14, %edi, %eax ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 14 @@ -349,19 +338,14 @@ define i16 @test_mul_by_19(i16 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: shll $2, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $19, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_19: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: shll $2, %eax -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: imull $19, %edi, %eax +; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 19 ret i16 %mul @@ -391,16 +375,13 @@ define i16 @test_mul_by_21(i16 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $21, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_21: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: imull $21, %edi, %eax ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 21 @@ -410,19 +391,14 @@ define i16 @test_mul_by_21(i16 %x) { define i16 @test_mul_by_22(i16 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $22, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_22: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: imull $22, %edi, %eax ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 22 @@ -433,19 +409,14 @@ define i16 @test_mul_by_23(i16 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: shll $3, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $23, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_23: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: shll $3, %eax -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: imull $23, %edi, %eax +; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 23 ret i16 %mul @@ -495,19 +466,14 @@ define i16 @test_mul_by_26(i16 %x) { ; X86-LABEL: test_mul_by_26: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $26, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_26: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl 
%edi, %eax +; X64-NEXT: imull $26, %edi, %eax +; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 26 ret i16 %mul @@ -536,19 +502,14 @@ define i16 @test_mul_by_27(i16 %x) { define i16 @test_mul_by_28(i16 %x) { ; X86-LABEL: test_mul_by_28: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $28, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_28: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: imull $28, %edi, %eax ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 28 @@ -558,21 +519,14 @@ define i16 @test_mul_by_28(i16 %x) { define i16 @test_mul_by_29(i16 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $29, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_29: ; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: imull $29, %edi, %eax ; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 29 @@ -583,22 +537,14 @@ define i16 @test_mul_by_30(i16 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # BB#0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $5, %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: imull $30, %eax, %eax ; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X86-NEXT: retl ; ; X64-LABEL: test_mul_by_30: ; X64: # BB#0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $5, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: subl %ecx, %edi -; X64-NEXT: movl %edi, %eax +; X64-NEXT: imull $30, %edi, %eax +; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> ; X64-NEXT: retq %mul = mul nsw i16 %x, 30 ret i16 %mul @@ -641,30 +587,3 @@ define i16 @test_mul_by_32(i16 %x) { %mul = mul nsw i16 %x, 32 ret i16 %mul } - -; (x*9+42)*(x*5+2) -define i16 @test_mul_spec(i16 %x) nounwind { -; X86-LABEL: test_mul_spec: -; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NEXT: leal 2(%eax,%eax,4), %eax -; X86-NEXT: imull %ecx, %eax -; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> -; X86-NEXT: retl -; -; X64-LABEL: test_mul_spec: -; X64: # BB#0: -; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-NEXT: leal 42(%rdi,%rdi,8), %ecx -; X64-NEXT: leal 2(%rdi,%rdi,4), %eax -; X64-NEXT: imull %ecx, %eax -; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> -; X64-NEXT: retq - %mul = mul nsw i16 %x, 9 - %add = add nsw i16 %mul, 42 - %mul2 = mul nsw i16 %x, 5 - %add2 = add nsw i16 %mul2, 2 - %mul3 = mul nsw i16 %add, %add2 - ret i16 %mul3 -} diff --git a/test/CodeGen/X86/mul-constant-i32.ll b/test/CodeGen/X86/mul-constant-i32.ll index b1e9a929b7f26..76e46e1f1b09e 100644 --- a/test/CodeGen/X86/mul-constant-i32.ll +++ 
b/test/CodeGen/X86/mul-constant-i32.ll @@ -1,12 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG -; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 define i32 @test_mul_by_1(i32 %x) { ; X86-LABEL: test_mul_by_1: @@ -14,40 +8,10 @@ define i32 @test_mul_by_1(i32 %x) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_1: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_1: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_1: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_1: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_1: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_1: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_1: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_1: +; X64: # BB#0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 1 ret i32 %mul } @@ -59,47 +23,11 @@ define i32 @test_mul_by_2(i32 %x) { ; X86-NEXT: addl %eax, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_2: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_2: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_2: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: addl %eax, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_2: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_2: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: 
leal (%rdi,%rdi), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_2: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_2: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_2: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 2 ret i32 %mul } @@ -110,46 +38,11 @@ define i32 @test_mul_by_3(i32 %x) { ; X86-NEXT: imull $3, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_3: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_3: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_3: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_3: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_3: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_3: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_3: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_3: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 3 ret i32 %mul } @@ -161,47 +54,11 @@ define i32 @test_mul_by_4(i32 %x) { ; X86-NEXT: shll $2, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_4: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_4: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_4: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $2, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_4: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_4: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # 
sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_4: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_4: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_4: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 4 ret i32 %mul } @@ -212,46 +69,11 @@ define i32 @test_mul_by_5(i32 %x) { ; X86-NEXT: imull $5, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_5: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_5: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_5: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_5: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_5: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_5: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_5: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_5: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 5 ret i32 %mul } @@ -264,46 +86,12 @@ define i32 @test_mul_by_6(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_6: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_6: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_6: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_6: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_6: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_6: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: 
%EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_6: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_6: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 6 ret i32 %mul } @@ -316,46 +104,12 @@ define i32 @test_mul_by_7(i32 %x) { ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_7: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_7: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_7: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_7: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_7: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_7: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_7: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_7: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (,%rdi,8), %eax +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 7 ret i32 %mul } @@ -367,47 +121,11 @@ define i32 @test_mul_by_8(i32 %x) { ; X86-NEXT: shll $3, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_8: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_8: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_8: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $3, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_8: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_8: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_8: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: 
[1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_8: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_8: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (,%rdi,8), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 8 ret i32 %mul } @@ -418,46 +136,11 @@ define i32 @test_mul_by_9(i32 %x) { ; X86-NEXT: imull $9, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_9: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_9: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_9: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_9: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_9: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_9: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_9: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_9: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 9 ret i32 %mul } @@ -470,46 +153,12 @@ define i32 @test_mul_by_10(i32 %x) { ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_10: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_10: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_10: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_10: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_10: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_10: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq 
# sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_10: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_10: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 10 ret i32 %mul } @@ -517,49 +166,13 @@ define i32 @test_mul_by_10(i32 %x) { define i32 @test_mul_by_11(i32 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: imull $11, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_11: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_11: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_11: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_11: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_11: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_11: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $11, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_11: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_11: +; X64: # BB#0: +; X64-NEXT: imull $11, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 11 ret i32 %mul } @@ -572,46 +185,12 @@ define i32 @test_mul_by_12(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_12: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_12: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_12: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_12: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_12: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_12: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; 
SLM-NOOPT-LABEL: test_mul_by_12: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_12: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: shll $2, %edi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 12 ret i32 %mul } @@ -619,49 +198,13 @@ define i32 @test_mul_by_12(i32 %x) { define i32 @test_mul_by_13(i32 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_13: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_13: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_13: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_13: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_13: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_13: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $13, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_13: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_13: +; X64: # BB#0: +; X64-NEXT: imull $13, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 13 ret i32 %mul } @@ -669,52 +212,13 @@ define i32 @test_mul_by_13(i32 %x) { define i32 @test_mul_by_14(i32 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_14: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_14: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_14: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_14: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_14: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $14, 
%edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_14: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $14, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_14: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_14: +; X64: # BB#0: +; X64-NEXT: imull $14, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 14 ret i32 %mul } @@ -727,46 +231,12 @@ define i32 @test_mul_by_15(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_15: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_15: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_15: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_15: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_15: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_15: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_15: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_15: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 15 ret i32 %mul } @@ -778,47 +248,11 @@ define i32 @test_mul_by_16(i32 %x) { ; X86-NEXT: shll $4, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_16: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_16: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shll $4, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_16: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $4, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_16: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_16: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_16: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shll $4, %edi # sched: [1:1.00] -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; 
X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_16: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shll $4, %edi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_16: +; X64: # BB#0: +; X64-NEXT: shll $4, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 16 ret i32 %mul } @@ -832,49 +266,13 @@ define i32 @test_mul_by_17(i32 %x) { ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_17: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_17: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_17: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_17: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_17: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_17: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_17: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_17: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $4, %eax +; X64-NEXT: leal (%rax,%rdi), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 17 ret i32 %mul } @@ -887,46 +285,12 @@ define i32 @test_mul_by_18(i32 %x) { ; X86-NEXT: leal (%eax,%eax,8), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_18: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_18: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_18: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_18: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_18: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_18: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: addl 
%edi, %edi # sched: [1:0.50] -; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_18: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_18: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: addl %edi, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 18 ret i32 %mul } @@ -934,54 +298,13 @@ define i32 @test_mul_by_18(i32 %x) { define i32 @test_mul_by_19(i32 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: shll $2, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_19: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: shll $2, %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_19: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: shll $2, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_19: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_19: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_19: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_19: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $19, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_19: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_19: +; X64: # BB#0: +; X64-NEXT: imull $19, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 19 ret i32 %mul } @@ -994,46 +317,12 @@ define i32 @test_mul_by_20(i32 %x) { ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_20: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_20: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_20: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_20: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_20: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00] 
-; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_20: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_20: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_20: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: shll $2, %edi +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 20 ret i32 %mul } @@ -1041,49 +330,13 @@ define i32 @test_mul_by_20(i32 %x) { define i32 @test_mul_by_21(i32 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_21: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_21: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_21: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_21: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_21: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_21: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $21, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_21: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_21: +; X64: # BB#0: +; X64-NEXT: imull $21, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 21 ret i32 %mul } @@ -1091,52 +344,13 @@ define i32 @test_mul_by_21(i32 %x) { define i32 @test_mul_by_22(i32 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: leal (%ecx,%eax,4), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_22: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_22: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_22: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: 
imull $22, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_22: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_22: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_22: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $22, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_22: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_22: +; X64: # BB#0: +; X64-NEXT: imull $22, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 22 ret i32 %mul } @@ -1144,54 +358,13 @@ define i32 @test_mul_by_22(i32 %x) { define i32 @test_mul_by_23(i32 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: shll $3, %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_23: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_23: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: shll $3, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_23: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_23: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_23: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_23: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $23, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_23: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_23: +; X64: # BB#0: +; X64-NEXT: imull $23, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 23 ret i32 %mul } @@ -1204,46 +377,12 @@ define i32 @test_mul_by_24(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_24: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_24: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: shll $3, %edi # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_24: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $24, 
{{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_24: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_24: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_24: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: shll $3, %edi # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_24: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_24: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: shll $3, %edi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 24 ret i32 %mul } @@ -1256,46 +395,12 @@ define i32 @test_mul_by_25(i32 %x) { ; X86-NEXT: leal (%eax,%eax,4), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_25: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_25: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_25: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_25: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_25: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_25: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rax,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_25: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_25: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 25 ret i32 %mul } @@ -1303,54 +408,13 @@ define i32 @test_mul_by_25(i32 %x) { define i32 @test_mul_by_26(i32 %x) { ; X86-LABEL: test_mul_by_26: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: subl %ecx, %eax +; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_26: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %eax, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: 
test_mul_by_26: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_26: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_26: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_26: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_26: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $26, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_26: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_26: +; X64: # BB#0: +; X64-NEXT: imull $26, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 26 ret i32 %mul } @@ -1363,46 +427,12 @@ define i32 @test_mul_by_27(i32 %x) { ; X86-NEXT: leal (%eax,%eax,2), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_27: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_27: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_27: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_27: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_27: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_27: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00] -; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_27: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_27: +; X64: # BB#0: +; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> +; X64-NEXT: leal (%rdi,%rdi,8), %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 27 ret i32 %mul } @@ -1410,52 +440,13 @@ define i32 @test_mul_by_27(i32 %x) { define i32 @test_mul_by_28(i32 %x) { ; X86-LABEL: test_mul_by_28: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_28: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal 
(%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_28: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_28: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_28: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_28: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_28: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $28, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_28: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_28: +; X64: # BB#0: +; X64-NEXT: imull $28, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 28 ret i32 %mul } @@ -1463,55 +454,13 @@ define i32 @test_mul_by_28(i32 %x) { define i32 @test_mul_by_29(i32 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_29: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_29: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] -; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_29: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_29: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_29: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_29: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $29, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_29: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_29: +; X64: # BB#0: +; X64-NEXT: imull $29, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 29 ret i32 %mul } @@ -1519,58 +468,13 @@ define i32 @test_mul_by_29(i32 %x) { define i32 
@test_mul_by_30(i32 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $5, %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: subl %edx, %eax +; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_30: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-HSW-NEXT: movl %edi, %ecx # sched: [1:0.25] -; X64-HSW-NEXT: subl %eax, %ecx # sched: [1:0.25] -; X64-HSW-NEXT: subl %ecx, %edi # sched: [1:0.25] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_30: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: movl %edi, %ecx # sched: [1:0.17] -; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %eax, %ecx # sched: [1:0.50] -; X64-JAG-NEXT: subl %ecx, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_30: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_30: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_30: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_30: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imull $30, %edi, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_30: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_30: +; X64: # BB#0: +; X64-NEXT: imull $30, %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 30 ret i32 %mul } @@ -1584,46 +488,12 @@ define i32 @test_mul_by_31(i32 %x) { ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_31: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_31: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50] -; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_31: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_31: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_31: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_31: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: shll $5, %eax # sched: [1:1.00] -; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_31: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_31: +; X64: # BB#0: 
+; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $5, %eax +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 31 ret i32 %mul } @@ -1635,124 +505,11 @@ define i32 @test_mul_by_32(i32 %x) { ; X86-NEXT: shll $5, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_32: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50] -; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_32: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shll $5, %edi # sched: [1:0.50] -; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_32: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: shll $5, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_32: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_32: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_32: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shll $5, %edi # sched: [1:1.00] -; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_32: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shll $5, %edi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_32: +; X64: # BB#0: +; X64-NEXT: shll $5, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq %mul = mul nsw i32 %x, 32 ret i32 %mul } - -; (x*9+42)*(x*5+2) -define i32 @test_mul_spec(i32 %x) nounwind { -; X86-LABEL: test_mul_spec: -; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NEXT: leal 2(%eax,%eax,4), %eax -; X86-NEXT: imull %ecx, %eax -; X86-NEXT: retl -; -; X64-HSW-LABEL: test_mul_spec: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-HSW-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50] -; X64-HSW-NEXT: addl $42, %ecx # sched: [1:0.25] -; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25] -; X64-HSW-NEXT: imull %ecx, %eax # sched: [4:1.00] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_spec: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-JAG-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50] -; X64-JAG-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50] -; X64-JAG-NEXT: imull %ecx, %eax # sched: [3:1.00] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_spec: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: leal 42(%eax,%eax,8), %ecx -; X86-NOOPT-NEXT: leal 2(%eax,%eax,4), %eax -; X86-NOOPT-NEXT: imull %ecx, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_spec: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50] -; HSW-NOOPT-NEXT: addl $42, %ecx # sched: [1:0.25] -; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50] -; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25] -; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [4:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_spec: 
-; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; JAG-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50] -; JAG-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50] -; JAG-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_spec: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; X64-SLM-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00] -; X64-SLM-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00] -; X64-SLM-NEXT: imull %ecx, %eax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_spec: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def> -; SLM-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00] -; SLM-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00] -; SLM-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] - %mul = mul nsw i32 %x, 9 - %add = add nsw i32 %mul, 42 - %mul2 = mul nsw i32 %x, 5 - %add2 = add nsw i32 %mul2, 2 - %mul3 = mul nsw i32 %add, %add2 - ret i32 %mul3 -} diff --git a/test/CodeGen/X86/mul-constant-i64.ll b/test/CodeGen/X86/mul-constant-i64.ll index 22eb0bdc6c3f8..8579179a82315 100644 --- a/test/CodeGen/X86/mul-constant-i64.ll +++ b/test/CodeGen/X86/mul-constant-i64.ll @@ -1,55 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG -; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT -; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM -; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 -define i64 @test_mul_by_1(i64 %x) nounwind { +define i64 @test_mul_by_1(i64 %x) { ; X86-LABEL: test_mul_by_1: ; X86: # BB#0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_1: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_1: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_1: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_1: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_1: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_1: -; X64-SLM: # BB#0: -; 
X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_1: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_1: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 1 ret i64 %mul } @@ -63,43 +26,10 @@ define i64 @test_mul_by_2(i64 %x) { ; X86-NEXT: addl %eax, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_2: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_2: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_2: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $1, %eax, %edx -; X86-NOOPT-NEXT: addl %eax, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_2: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_2: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_2: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_2: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_2: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 2 ret i64 %mul } @@ -113,43 +43,10 @@ define i64 @test_mul_by_3(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_3: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_3: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_3: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $3, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_3: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_3: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_3: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_3: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_3: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 3 ret i64 %mul } @@ -163,43 +60,10 @@ define i64 @test_mul_by_4(i64 %x) { ; X86-NEXT: shll $2, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_4: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_4: -; 
X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_4: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $2, %eax, %edx -; X86-NOOPT-NEXT: shll $2, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_4: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_4: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_4: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_4: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_4: +; X64: # BB#0: +; X64-NEXT: leaq (,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 4 ret i64 %mul } @@ -213,43 +77,10 @@ define i64 @test_mul_by_5(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_5: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_5: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_5: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $5, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_5: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_5: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_5: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_5: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_5: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 5 ret i64 %mul } @@ -264,46 +95,11 @@ define i64 @test_mul_by_6(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_6: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_6: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_6: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $6, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_6: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_6: -; 
JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_6: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_6: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_6: +; X64: # BB#0: +; X64-NEXT: addq %rdi, %rdi +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 6 ret i64 %mul } @@ -319,46 +115,11 @@ define i64 @test_mul_by_7(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_7: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_7: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_7: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $7, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_7: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_7: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_7: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_7: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_7: +; X64: # BB#0: +; X64-NEXT: leaq (,%rdi,8), %rax +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 7 ret i64 %mul } @@ -372,43 +133,10 @@ define i64 @test_mul_by_8(i64 %x) { ; X86-NEXT: shll $3, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_8: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_8: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_8: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $3, %eax, %edx -; X86-NOOPT-NEXT: shll $3, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_8: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_8: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_8: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_8: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; 
X64-LABEL: test_mul_by_8: +; X64: # BB#0: +; X64-NEXT: leaq (,%rdi,8), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 8 ret i64 %mul } @@ -422,43 +150,10 @@ define i64 @test_mul_by_9(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_9: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_9: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_9: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $9, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_9: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_9: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_9: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_9: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_9: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 9 ret i64 %mul } @@ -473,46 +168,11 @@ define i64 @test_mul_by_10(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_10: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_10: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_10: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $10, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_10: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_10: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_10: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_10: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_10: +; X64: # BB#0: +; X64-NEXT: addq %rdi, %rdi +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 10 ret i64 %mul } @@ -520,53 +180,16 @@ define i64 @test_mul_by_10(i64 %x) { define i64 @test_mul_by_11(i64 %x) { ; X86-LABEL: test_mul_by_11: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,2), %ecx ; X86-NEXT: movl $11, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: 
imull $11, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_11: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_11: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_11: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $11, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_11: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_11: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_11: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_11: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_11: +; X64: # BB#0: +; X64-NEXT: imulq $11, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 11 ret i64 %mul } @@ -581,46 +204,11 @@ define i64 @test_mul_by_12(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_12: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_12: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_12: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $12, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_12: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_12: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_12: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_12: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_12: +; X64: # BB#0: +; X64-NEXT: shlq $2, %rdi +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 12 ret i64 %mul } @@ -628,53 +216,16 @@ define i64 @test_mul_by_12(i64 %x) { define i64 @test_mul_by_13(i64 %x) { ; X86-LABEL: test_mul_by_13: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx ; X86-NEXT: movl $13, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: 
addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_13: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_13: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_13: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $13, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_13: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_13: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_13: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_13: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_13: +; X64: # BB#0: +; X64-NEXT: imulq $13, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 13 ret i64 %mul } @@ -682,56 +233,16 @@ define i64 @test_mul_by_13(i64 %x) { define i64 @test_mul_by_14(i64 %x) { ; X86-LABEL: test_mul_by_14: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $14, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_14: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_14: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_14: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $14, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_14: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_14: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_14: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_14: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_14: +; X64: # BB#0: +; X64-NEXT: imulq $14, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 14 ret i64 %mul } @@ -747,46 +258,11 @@ define i64 @test_mul_by_15(i64 %x) { 
; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_15: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_15: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_15: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $15, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_15: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_15: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_15: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_15: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_15: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: leaq (%rax,%rax,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 15 ret i64 %mul } @@ -800,49 +276,11 @@ define i64 @test_mul_by_16(i64 %x) { ; X86-NEXT: shll $4, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_16: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $4, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_16: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $4, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_16: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $4, %eax, %edx -; X86-NOOPT-NEXT: shll $4, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_16: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_16: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_16: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $4, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_16: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shlq $4, %rdi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_16: +; X64: # BB#0: +; X64-NEXT: shlq $4, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 16 ret i64 %mul } @@ -859,49 +297,12 @@ define i64 @test_mul_by_17(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_17: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; 
X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_17: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: shlq $4, %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_17: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $17, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_17: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_17: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_17: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: shlq $4, %rax # sched: [1:1.00] -; X64-SLM-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_17: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_17: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: leaq (%rax,%rdi), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 17 ret i64 %mul } @@ -916,46 +317,11 @@ define i64 @test_mul_by_18(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,2), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_18: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_18: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_18: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $18, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_18: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_18: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_18: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50] -; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_18: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_18: +; X64: # BB#0: +; X64-NEXT: addq %rdi, %rdi +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 18 ret i64 %mul } @@ -963,58 +329,16 @@ define i64 @test_mul_by_18(i64 %x) { define i64 @test_mul_by_19(i64 %x) { ; X86-LABEL: test_mul_by_19: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,4), %eax -; X86-NEXT: shll $2, %eax -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $19, %eax ; X86-NEXT: mull 
{{[0-9]+}}(%esp) +; X86-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_19: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: shlq $2, %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_19: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: shlq $2, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_19: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $19, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_19: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_19: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_19: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_19: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_19: +; X64: # BB#0: +; X64-NEXT: imulq $19, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 19 ret i64 %mul } @@ -1029,46 +353,11 @@ define i64 @test_mul_by_20(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,4), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_20: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_20: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_20: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $20, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_20: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_20: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_20: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_20: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_20: +; X64: # BB#0: +; X64-NEXT: shlq $2, %rdi +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 20 ret i64 %mul } @@ -1076,53 +365,16 @@ define i64 @test_mul_by_20(i64 %x) { define i64 @test_mul_by_21(i64 %x) { ; X86-LABEL: test_mul_by_21: ; X86: # BB#0: -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx ; X86-NEXT: movl $21, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_21: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_21: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_21: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $21, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_21: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_21: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_21: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_21: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_21: +; X64: # BB#0: +; X64-NEXT: imulq $21, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 21 ret i64 %mul } @@ -1130,56 +382,16 @@ define i64 @test_mul_by_21(i64 %x) { define i64 @test_mul_by_22(i64 %x) { ; X86-LABEL: test_mul_by_22: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,4), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $22, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_22: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_22: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_22: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $22, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_22: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_22: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_22: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_22: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: 
retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_22: +; X64: # BB#0: +; X64-NEXT: imulq $22, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 22 ret i64 %mul } @@ -1187,58 +399,16 @@ define i64 @test_mul_by_22(i64 %x) { define i64 @test_mul_by_23(i64 %x) { ; X86-LABEL: test_mul_by_23: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %eax -; X86-NEXT: shll $3, %eax -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $23, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_23: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: shlq $3, %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_23: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: shlq $3, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_23: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $23, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_23: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_23: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_23: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_23: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_23: +; X64: # BB#0: +; X64-NEXT: imulq $23, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 23 ret i64 %mul } @@ -1253,46 +423,11 @@ define i64 @test_mul_by_24(i64 %x) { ; X86-NEXT: leal (%edx,%ecx,8), %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_24: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $3, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_24: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $3, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_24: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $24, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_24: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_24: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_24: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $3, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; 
SLM-NOOPT-LABEL: test_mul_by_24: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_24: +; X64: # BB#0: +; X64-NEXT: shlq $3, %rdi +; X64-NEXT: leaq (%rdi,%rdi,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 24 ret i64 %mul } @@ -1308,46 +443,11 @@ define i64 @test_mul_by_25(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_25: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_25: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_25: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $25, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_25: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_25: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_25: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_25: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_25: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,4), %rax +; X64-NEXT: leaq (%rax,%rax,4), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 25 ret i64 %mul } @@ -1355,58 +455,16 @@ define i64 @test_mul_by_25(i64 %x) { define i64 @test_mul_by_26(i64 %x) { ; X86-LABEL: test_mul_by_26: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl $26, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_26: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rax, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_26: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_26: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $26, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_26: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_26: -; JAG-NOOPT: # BB#0: -; 
JAG-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_26: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_26: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_26: +; X64: # BB#0: +; X64-NEXT: imulq $26, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 26 ret i64 %mul } @@ -1422,46 +480,11 @@ define i64 @test_mul_by_27(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_27: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_27: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_27: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $27, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_27: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_27: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_27: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00] -; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_27: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_27: +; X64: # BB#0: +; X64-NEXT: leaq (%rdi,%rdi,8), %rax +; X64-NEXT: leaq (%rax,%rax,2), %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 27 ret i64 %mul } @@ -1469,56 +492,16 @@ define i64 @test_mul_by_27(i64 %x) { define i64 @test_mul_by_28(i64 %x) { ; X86-LABEL: test_mul_by_28: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $28, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_28: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_28: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_28: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $28, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_28: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $28, %rdi, %rax # 
sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_28: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_28: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_28: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_28: +; X64: # BB#0: +; X64-NEXT: imulq $28, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 28 ret i64 %mul } @@ -1526,59 +509,16 @@ define i64 @test_mul_by_28(i64 %x) { define i64 @test_mul_by_29(i64 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,8), %ecx -; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $29, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_29: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_29: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] -; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_29: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $29, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_29: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_29: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_29: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_29: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_29: +; X64: # BB#0: +; X64-NEXT: imulq $29, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 29 ret i64 %mul } @@ -1586,60 +526,16 @@ define i64 @test_mul_by_29(i64 %x) { define i64 @test_mul_by_30(i64 %x) { ; X86-LABEL: test_mul_by_30: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $5, %ecx ; X86-NEXT: movl $30, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_30: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50] -; X64-HSW-NEXT: movq %rdi, %rcx # sched: [1:0.25] -; X64-HSW-NEXT: subq %rax, %rcx # sched: [1:0.25] -; X64-HSW-NEXT: subq %rcx, %rdi # sched: [1:0.25] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq 
# sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_30: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: movq %rdi, %rcx # sched: [1:0.17] -; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rax, %rcx # sched: [1:0.50] -; X64-JAG-NEXT: subq %rcx, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_30: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $30, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_30: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_30: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_30: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_30: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_30: +; X64: # BB#0: +; X64-NEXT: imulq $30, %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 30 ret i64 %mul } @@ -1656,49 +552,12 @@ define i64 @test_mul_by_31(i64 %x) { ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_31: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50] -; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_by_31: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50] -; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_31: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl $31, %eax -; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_31: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_31: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_31: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: shlq $5, %rax # sched: [1:1.00] -; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_31: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_31: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shlq $5, %rax +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 31 ret i64 %mul } @@ -1712,168 +571,11 @@ define i64 @test_mul_by_32(i64 %x) { ; X86-NEXT: shll $5, %eax ; X86-NEXT: retl ; -; X64-HSW-LABEL: test_mul_by_32: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: shlq $5, %rdi # sched: [1:0.50] -; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: 
test_mul_by_32: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: shlq $5, %rdi # sched: [1:0.50] -; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_by_32: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOOPT-NEXT: shldl $5, %eax, %edx -; X86-NOOPT-NEXT: shll $5, %eax -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_by_32: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50] -; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_by_32: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50] -; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_by_32: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: shlq $5, %rdi # sched: [1:1.00] -; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_by_32: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: shlq $5, %rdi # sched: [1:1.00] -; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] +; X64-LABEL: test_mul_by_32: +; X64: # BB#0: +; X64-NEXT: shlq $5, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq %mul = mul nsw i64 %x, 32 ret i64 %mul } - -; (x*9+42)*(x*5+2) -define i64 @test_mul_spec(i64 %x) nounwind { -; X86-LABEL: test_mul_spec: -; X86: # BB#0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl $9, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: leal (%edi,%edi,8), %ebx -; X86-NEXT: addl $42, %esi -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl $5, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: leal (%edi,%edi,4), %edi -; X86-NEXT: addl $2, %ecx -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: imull %esi, %edi -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: retl -; -; X64-HSW-LABEL: test_mul_spec: -; X64-HSW: # BB#0: -; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50] -; X64-HSW-NEXT: addq $42, %rcx # sched: [1:0.25] -; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq $2, %rax # sched: [1:0.25] -; X64-HSW-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; X64-HSW-NEXT: retq # sched: [1:1.00] -; -; X64-JAG-LABEL: test_mul_spec: -; X64-JAG: # BB#0: -; X64-JAG-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50] -; X64-JAG-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50] -; X64-JAG-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; X64-JAG-NEXT: retq # sched: [4:1.00] -; -; X86-NOOPT-LABEL: test_mul_spec: -; X86-NOOPT: # BB#0: -; X86-NOOPT-NEXT: pushl %ebx -; X86-NOOPT-NEXT: pushl %edi -; X86-NOOPT-NEXT: pushl %esi -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOOPT-NEXT: movl $9, %edx -; X86-NOOPT-NEXT: movl %ecx, %eax -; X86-NOOPT-NEXT: mull %edx -; X86-NOOPT-NEXT: movl %eax, %esi -; X86-NOOPT-NEXT: leal (%edi,%edi,8), %ebx -; X86-NOOPT-NEXT: addl $42, %esi -; X86-NOOPT-NEXT: adcl %edx, %ebx -; X86-NOOPT-NEXT: movl $5, %edx -; X86-NOOPT-NEXT: movl %ecx, %eax -; X86-NOOPT-NEXT: mull 
%edx -; X86-NOOPT-NEXT: movl %eax, %ecx -; X86-NOOPT-NEXT: leal (%edi,%edi,4), %edi -; X86-NOOPT-NEXT: addl $2, %ecx -; X86-NOOPT-NEXT: adcl %edx, %edi -; X86-NOOPT-NEXT: movl %esi, %eax -; X86-NOOPT-NEXT: mull %ecx -; X86-NOOPT-NEXT: imull %esi, %edi -; X86-NOOPT-NEXT: addl %edi, %edx -; X86-NOOPT-NEXT: imull %ebx, %ecx -; X86-NOOPT-NEXT: addl %ecx, %edx -; X86-NOOPT-NEXT: popl %esi -; X86-NOOPT-NEXT: popl %edi -; X86-NOOPT-NEXT: popl %ebx -; X86-NOOPT-NEXT: retl -; -; HSW-NOOPT-LABEL: test_mul_spec: -; HSW-NOOPT: # BB#0: -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50] -; HSW-NOOPT-NEXT: addq $42, %rcx # sched: [1:0.25] -; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50] -; HSW-NOOPT-NEXT: addq $2, %rax # sched: [1:0.25] -; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; HSW-NOOPT-NEXT: retq # sched: [1:1.00] -; -; JAG-NOOPT-LABEL: test_mul_spec: -; JAG-NOOPT: # BB#0: -; JAG-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50] -; JAG-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50] -; JAG-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; JAG-NOOPT-NEXT: retq # sched: [4:1.00] -; -; X64-SLM-LABEL: test_mul_spec: -; X64-SLM: # BB#0: -; X64-SLM-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00] -; X64-SLM-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00] -; X64-SLM-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; X64-SLM-NEXT: retq # sched: [4:1.00] -; -; SLM-NOOPT-LABEL: test_mul_spec: -; SLM-NOOPT: # BB#0: -; SLM-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00] -; SLM-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00] -; SLM-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00] -; SLM-NOOPT-NEXT: retq # sched: [4:1.00] - %mul = mul nsw i64 %x, 9 - %add = add nsw i64 %mul, 42 - %mul2 = mul nsw i64 %x, 5 - %add2 = add nsw i64 %mul2, 2 - %mul3 = mul nsw i64 %add, %add2 - ret i64 %mul3 -} diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll index d26cf02dd9424..0bda41a30c697 100644 --- a/test/CodeGen/X86/oddshuffles.ll +++ b/test/CodeGen/X86/oddshuffles.ll @@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE2-LABEL: interleave_24i8_in: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE42: # BB#0: ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5] +; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = 
zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero -; SSE42-NEXT: por %xmm1, %xmm3 +; SSE42-NEXT: por %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm0, %xmm2 -; SSE42-NEXT: movq %xmm2, 16(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: movq %xmm1, 16(%rdi) ; SSE42-NEXT: movdqu %xmm3, (%rdi) ; SSE42-NEXT: retq ; @@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; AVX: # BB#0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) -; AVX-NEXT: vmovdqu %xmm1, (%rdi) +; AVX-NEXT: vmovdqu %xmm2, (%rdi) ; AVX-NEXT: retq %s1 = load <8 x i8>, <8 x i8>* %q1, align 4 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4 diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 88cb7a6d58258..50a661fcca114 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: retq ; @@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; 
SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pmuludq %xmm7, %xmm5 +; SSE2-NEXT: pmuludq %xmm7, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm8, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: pmuludq %xmm0, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 @@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm4, %xmm1 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pmuludq %xmm5, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pmuludq %xmm6, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmuludq %xmm7, %xmm1 -; SSE41-NEXT: pmuludq %xmm6, %xmm2 -; SSE41-NEXT: pmuludq %xmm5, %xmm0 -; SSE41-NEXT: pmuludq %xmm8, %xmm4 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; @@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -1467,22 +1467,22 @@ define <8 x i64> 
@mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_sext: ; SSE41: # BB#0: ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm8 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm7 -; SSE41-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE41-NEXT: pmuldq %xmm4, %xmm3 ; SSE41-NEXT: pmovsxdq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; SSE41-NEXT: pmuldq %xmm6, %xmm4 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm0 -; SSE41-NEXT: pmuldq %xmm5, %xmm0 -; SSE41-NEXT: pmuldq %xmm7, %xmm4 -; SSE41-NEXT: pmuldq %xmm6, %xmm2 -; SSE41-NEXT: pmuldq %xmm8, %xmm3 +; SSE41-NEXT: pmuldq %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; @@ -1493,9 +1493,10 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: mul_v8i64_sext: diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll index 571dd6774906a..c54909cf93c19 100644 --- a/test/CodeGen/X86/pr32284.ll +++ b/test/CodeGen/X86/pr32284.ll @@ -1,81 +1,17 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X86-O0 -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X64-O0 +; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686 +; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686 +; REQUIRES: asserts @c = external constant i8, align 1 define void @foo() { -; X86-LABEL: foo: -; X86: # BB#0: # %entry -; X86-NEXT: subl $8, %esp -; X86-NEXT: .Lcfi0: -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: movzbl c, %eax -; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %cl -; X86-NEXT: testb %al, %al -; X86-NEXT: setne {{[0-9]+}}(%esp) -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %eax, %ecx -; X86-NEXT: setle %dl -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: addl $8, %esp -; X86-NEXT: retl -; -; X86-O0-LABEL: foo: -; X86-O0: # BB#0: # %entry -; X86-O0-NEXT: subl $12, %esp -; X86-O0-NEXT: .Lcfi0: -; X86-O0-NEXT: .cfi_def_cfa_offset 16 -; X86-O0-NEXT: movb c, %al -; X86-O0-NEXT: testb %al, %al -; X86-O0-NEXT: setne {{[0-9]+}}(%esp) -; X86-O0-NEXT: movzbl c, %ecx -; X86-O0-NEXT: testl %ecx, %ecx -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: movzbl %al, %edx -; X86-O0-NEXT: subl %ecx, %edx -; 
X86-O0-NEXT: setle %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %ecx -; X86-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-O0-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-O0-NEXT: addl $12, %esp -; X86-O0-NEXT: retl -; -; X64-LABEL: foo: -; X64: # BB#0: # %entry -; X64-NEXT: movzbl {{.*}}(%rip), %eax -; X64-NEXT: testb %al, %al -; X64-NEXT: setne -{{[0-9]+}}(%rsp) -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testl %eax, %eax -; X64-NEXT: setne %cl -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: cmpl %eax, %ecx -; X64-NEXT: setle %dl -; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) -; X64-NEXT: retq -; -; X64-O0-LABEL: foo: -; X64-O0: # BB#0: # %entry -; X64-O0-NEXT: movb {{.*}}(%rip), %al -; X64-O0-NEXT: testb %al, %al -; X64-O0-NEXT: setne -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movzbl {{.*}}(%rip), %ecx -; X64-O0-NEXT: testl %ecx, %ecx -; X64-O0-NEXT: setne %al -; X64-O0-NEXT: movzbl %al, %edx -; X64-O0-NEXT: subl %ecx, %edx -; X64-O0-NEXT: setle %al -; X64-O0-NEXT: andb $1, %al -; X64-O0-NEXT: movzbl %al, %ecx -; X64-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movl %edx, -{{[0-9]+}}(%rsp) # 4-byte Spill -; X64-O0-NEXT: retq +; CHECK-LABEL: foo: +; CHECK: # BB#0: # %entry +; CHECK-DAG: setne +; CHECK-DAG: setle +; CHECK: ret entry: %a = alloca i8, align 1 %b = alloca i32, align 4 @@ -100,3 +36,125 @@ entry: store i32 %conv8, i32* %b, align 4 ret void } + +@var_5 = external global i32, align 4 +@var_57 = external global i64, align 8 +@_ZN8struct_210member_2_0E = external global i64, align 8 + +define void @f1() { +; CHECK-LABEL: f1: +; CHECK: # BB#0: # %entry +; CHECK: sete +; X64: addq $7093, {{.*}} +; 686: addl $7093, {{.*}} +; CHECK: ret +entry: + %a = alloca i8, align 1 + %0 = load i32, i32* @var_5, align 4 + %conv = sext i32 %0 to i64 + %add = add nsw i64 %conv, 8381627093 + %tobool = icmp ne i64 %add, 0 + %frombool = zext i1 %tobool to i8 + store i8 %frombool, i8* %a, align 1 + %1 = load i32, i32* @var_5, align 4 + %neg = xor i32 %1, -1 + %tobool1 = icmp ne i32 %neg, 0 + %lnot = xor i1 %tobool1, true + %conv2 = zext i1 %lnot to i64 + %2 = load i32, i32* @var_5, align 4 + %conv3 = sext i32 %2 to i64 + %add4 = add nsw i64 %conv3, 7093 + %cmp = icmp sgt i64 %conv2, %add4 + %conv5 = zext i1 %cmp to i64 + store i64 %conv5, i64* @var_57, align 8 + %3 = load i32, i32* @var_5, align 4 + %neg6 = xor i32 %3, -1 + %tobool7 = icmp ne i32 %neg6, 0 + %lnot8 = xor i1 %tobool7, true + %conv9 = zext i1 %lnot8 to i64 + store i64 %conv9, i64* @_ZN8struct_210member_2_0E, align 8 + ret void +} + + +@var_7 = external global i8, align 1 + +define void @f2() { +; CHECK-LABEL: f2: +; CHECK: # BB#0: # %entry +; X64: movzbl {{.*}}(%rip), %[[R:[a-z]*]] +; 686: movzbl {{.*}}, %[[R:[a-z]*]] +; CHECK: test{{[qlwb]}} %[[R]], %[[R]] +; CHECK: sete {{.*}} +; CHECK: ret +entry: + %a = alloca i16, align 2 + %0 = load i8, i8* @var_7, align 1 + %conv = zext i8 %0 to i32 + %1 = load i8, i8* @var_7, align 1 + %tobool = icmp ne i8 %1, 0 + %lnot = xor i1 %tobool, true + %conv1 = zext i1 %lnot to i32 + %xor = xor i32 %conv, %conv1 + %conv2 = trunc i32 %xor to i16 + store i16 %conv2, i16* %a, align 2 + %2 = load i8, i8* @var_7, align 1 + %conv3 = zext i8 %2 to i16 + %tobool4 = icmp ne i16 %conv3, 0 + %lnot5 = xor i1 %tobool4, true + %conv6 = zext i1 %lnot5 to i32 + %3 = load i8, i8* @var_7, align 1 + %conv7 = zext i8 %3 to i32 + %cmp = icmp eq i32 %conv6, %conv7 + %conv8 = zext i1 %cmp to i32 + %conv9 = trunc i32 %conv8 to i16 + store i16 %conv9, i16* undef, align 2 + ret void +} + + +@var_13 = external global i32, align 4 
+@var_16 = external global i32, align 4 +@var_46 = external global i32, align 4 + +define void @f3() #0 { +; CHECK-LABEL: f3: +; X64-DAG: movl var_13(%rip), {{.*}} +; X64-DAG: movl var_16(%rip), {{.*}} +; X64-DAG: movl {{.*}},{{.*}}var_46{{.*}} +; X64: retq +; 686-DAG: movl var_13, {{.*}} +; 686-DAG: movl var_16, {{.*}} +; 686-DAG: movl {{.*}},{{.*}}var_46{{.*}} +; 686: retl +entry: + %a = alloca i64, align 8 + %0 = load i32, i32* @var_13, align 4 + %neg = xor i32 %0, -1 + %conv = zext i32 %neg to i64 + %1 = load i32, i32* @var_13, align 4 + %tobool = icmp ne i32 %1, 0 + %lnot = xor i1 %tobool, true + %conv1 = zext i1 %lnot to i64 + %2 = load i32, i32* @var_13, align 4 + %neg2 = xor i32 %2, -1 + %3 = load i32, i32* @var_16, align 4 + %xor = xor i32 %neg2, %3 + %conv3 = zext i32 %xor to i64 + %and = and i64 %conv1, %conv3 + %or = or i64 %conv, %and + store i64 %or, i64* %a, align 8 + %4 = load i32, i32* @var_13, align 4 + %neg4 = xor i32 %4, -1 + %conv5 = zext i32 %neg4 to i64 + %5 = load i32, i32* @var_13, align 4 + %tobool6 = icmp ne i32 %5, 0 + %lnot7 = xor i1 %tobool6, true + %conv8 = zext i1 %lnot7 to i64 + %and9 = and i64 %conv8, 0 + %or10 = or i64 %conv5, %and9 + %conv11 = trunc i64 %or10 to i32 + store i32 %conv11, i32* @var_46, align 4 + ret void +} + diff --git a/test/CodeGen/X86/pr32610.ll b/test/CodeGen/X86/pr32610.ll new file mode 100644 index 0000000000000..1116cf6f1b29a --- /dev/null +++ b/test/CodeGen/X86/pr32610.ll @@ -0,0 +1,40 @@ +; RUN: llc -o - %s | FileCheck %s + +; CHECK-LABEL: @pr32610 +; CHECK: movl L_b$non_lazy_ptr, [[BASEREG:%[a-z]+]] +; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}} +; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}} + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.13.0" + +@c = external local_unnamed_addr global i32, align 4 +@b = external local_unnamed_addr global [1 x i32], align 4 +@d = external local_unnamed_addr global i32, align 4 + +; Function Attrs: norecurse nounwind optsize ssp +define void @pr32610() local_unnamed_addr #0 { +entry: + %0 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i32 0, i32 undef), align 4, !tbaa !1 + %cmp = icmp eq i32 undef, %0 + %conv = zext i1 %cmp to i32 + %tobool1.i = icmp ne i32 undef, 0 + %or.cond.i = and i1 %cmp, %tobool1.i + %cond.i = select i1 %or.cond.i, i32 %conv, i32 undef + store i32 %cond.i, i32* @c, align 4, !tbaa !1 + %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i32 0, i32 0), align 4 + %tobool = icmp ne i32 %1, 0 + %2 = select i1 %tobool, i32 %1, i32 undef + store i32 %2, i32* @d, align 4, !tbaa !1 + ret void +} + +attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 (trunk 301507) (llvm/trunk 301505)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll index 5d5150ad62d60..4be3a4c2391b4 100644 --- a/test/CodeGen/X86/rotate.ll +++ 
b/test/CodeGen/X86/rotate.ll @@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB0_4: -; 32-NEXT: orl %esi, %eax ; 32-NEXT: orl %ebx, %edx +; 32-NEXT: orl %esi, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB1_4: -; 32-NEXT: orl %ebx, %eax ; 32-NEXT: orl %esi, %edx +; 32-NEXT: orl %ebx, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-LABEL: rotr1_64_mem: ; 32: # BB#0: ; 32-NEXT: pushl %esi -; 32-NEXT: movl 8(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: movl (%eax), %ecx ; 32-NEXT: movl 4(%eax), %edx ; 32-NEXT: movl %edx, %esi @@ -555,11 +555,13 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-NEXT: movl %ecx, 4(%eax) ; 32-NEXT: movl %esi, (%eax) ; 32-NEXT: popl %esi - +; 32-NEXT: retl +; ; 64-LABEL: rotr1_64_mem: ; 64: # BB#0: ; 64-NEXT: rorq (%rdi) ; 64-NEXT: retq + %A = load i64, i64 *%Aptr %B = shl i64 %A, 63 %C = lshr i64 %A, 1 @@ -571,7 +573,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { define void @rotr1_32_mem(i32* %Aptr) nounwind { ; 32-LABEL: rotr1_32_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorl (%eax) ; 32-NEXT: retl ; @@ -590,7 +592,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind { define void @rotr1_16_mem(i16* %Aptr) nounwind { ; 32-LABEL: rotr1_16_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorw (%eax) ; 32-NEXT: retl ; @@ -609,7 +611,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind { define void @rotr1_8_mem(i8* %Aptr) nounwind { ; 32-LABEL: rotr1_8_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorb (%eax) ; 32-NEXT: retl ; diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll index b8a8b8afd14fd..6a565a5c76f0b 100644 --- a/test/CodeGen/X86/sad.ll +++ b/test/CodeGen/X86/sad.ll @@ -149,127 +149,131 @@ middle.block: define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 -; 
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm7
 ; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm9, %xmm6
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm10, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm10, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
 ; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-NEXT: psubd %xmm2, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; SSE2-NEXT: psubd %xmm4, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm10
-; SSE2-NEXT: pxor %xmm2, %xmm10
-; SSE2-NEXT: movdqa %xmm8, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm8
-; SSE2-NEXT: pxor %xmm2, %xmm8
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm2, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
 ; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm2, %xmm6
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm6, %xmm14
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm6, %xmm0
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm9, %xmm8
+; SSE2-NEXT: movdqa %xmm7, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm7
 ; SSE2-NEXT: paddd %xmm7, %xmm13
-; SSE2-NEXT: paddd %xmm1, %xmm15
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm10, %xmm6
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm3
 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm12
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm2
-; SSE2-NEXT: paddd %xmm8, %xmm3
-; SSE2-NEXT: paddd %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm14
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm15
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm0
 ; SSE2-NEXT: addq $4, %rax
 ; SSE2-NEXT: jne .LBB1_1
 ; SSE2-NEXT: # BB#2: # %middle.block
-; SSE2-NEXT: paddd %xmm15, %xmm3
-; SSE2-NEXT: paddd %xmm14, %xmm1
-; SSE2-NEXT: paddd %xmm12, %xmm0
-; SSE2-NEXT: paddd %xmm13, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm15, %xmm6
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: paddd %xmm14, %xmm13
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: paddd %xmm13, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; SSE2-NEXT: paddd %xmm4, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: movd %xmm1, %eax
@@ -398,288 +402,284 @@ middle.block:
 define i32 @sad_avx64i8() nounwind {
 ; SSE2-LABEL: sad_avx64i8:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: subq $184, %rsp
-; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: subq $200, %rsp
+; SSE2-NEXT: pxor %xmm14, %xmm14
 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm14, %xmm14
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm11, %xmm11
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm13, %xmm13
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm7, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB2_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa a+1040(%rax), %xmm6
-; SSE2-NEXT: movdqa a+1024(%rax), %xmm4
-; SSE2-NEXT: movdqa a+1056(%rax), %xmm11
-; SSE2-NEXT: movdqa a+1072(%rax), %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm11, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm12
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm14
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm14, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
-; SSE2-NEXT: movdqa b+1040(%rax), %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm13
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm6
-; SSE2-NEXT: movdqa b+1024(%rax), %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm10, %xmm8
-; SSE2-NEXT: movdqa %xmm13, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm13, %xmm14
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm9, %xmm7
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm4
-; SSE2-NEXT: movdqa b+1056(%rax), %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm10, %xmm12
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm2, %xmm11
-; SSE2-NEXT: movdqa %xmm1, %xmm13
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm10
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm13
-; SSE2-NEXT: movdqa b+1072(%rax), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm2, %xmm3
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm9, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15]
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT: psubd %xmm0, %xmm5
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
-; SSE2-NEXT: psubd %xmm2, %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm9
-; SSE2-NEXT: pxor %xmm0, %xmm9
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm12, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
 ; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm10
-; SSE2-NEXT: pxor %xmm0, %xmm10
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm13, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm13
-; SSE2-NEXT: pxor %xmm0, %xmm13
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm11
-; SSE2-NEXT: pxor %xmm0, %xmm11
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15]
 ; SSE2-NEXT: movdqa %xmm12, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm12
-; SSE2-NEXT: pxor %xmm0, %xmm12
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; SSE2-NEXT: movdqa b+1072(%rax), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm0, %xmm1
+; SSE2-NEXT: movdqa b+1056(%rax), %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm7, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm7, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm7, %xmm8
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm11
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm0, %xmm15
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm9
+; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm0, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm0, %xmm13
+; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm9, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm7, %xmm12
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm13
+; SSE2-NEXT: movdqa %xmm13, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm7, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm3, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15]
+; SSE2-NEXT: movdqa %xmm13, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; SSE2-NEXT: psubd %xmm3, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
+; SSE2-NEXT: psubd %xmm13, %xmm2
 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm8
+; SSE2-NEXT: pxor %xmm1, %xmm8
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm11, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm11
+; SSE2-NEXT: pxor %xmm1, %xmm11
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm11, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm11
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm15, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm15
+; SSE2-NEXT: pxor %xmm1, %xmm15
+; SSE2-NEXT: paddd %xmm15, %xmm2
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm15
+; SSE2-NEXT: movdqa %xmm10, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm10
+; SSE2-NEXT: pxor %xmm1, %xmm10
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm6
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: movdqa %xmm12, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm12
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: movdqa %xmm9, %xmm0
 ; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: paddd %xmm0, %xmm9
+; SSE2-NEXT: pxor %xmm0, %xmm9
+; SSE2-NEXT: paddd %xmm9, %xmm1
 ; SSE2-NEXT: movdqa %xmm7, %xmm0
 ; SSE2-NEXT: psrad $31, %xmm0
 ; SSE2-NEXT: paddd %xmm0, %xmm7
 ; SSE2-NEXT: pxor %xmm0, %xmm7
-; SSE2-NEXT: movdqa %xmm14, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm14
-; SSE2-NEXT: pxor %xmm0, %xmm14
-; SSE2-NEXT: movdqa %xmm8, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: movdqa %xmm6, %xmm0
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm7, %xmm0
 ; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm6
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm8, %xmm6
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm14, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm7, %xmm2
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm12, %xmm8
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa %xmm0, %xmm12
-; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm11, %xmm0
-; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm7
+; SSE2-NEXT: pxor %xmm0, %xmm7
 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm1, %xmm2
-; SSE2-NEXT: paddd %xmm13, %xmm7
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm10, %xmm1
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm3
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm9, %xmm5
-; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm0
 ; SSE2-NEXT: addq $4, %rax
 ; SSE2-NEXT: jne .LBB2_1
 ; SSE2-NEXT: # BB#2: # %middle.block
-; SSE2-NEXT: paddd %xmm2, %xmm4
-; SSE2-NEXT: paddd %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm12, %xmm2
-; SSE2-NEXT: paddd %xmm11, %xmm2
-; SSE2-NEXT: paddd %xmm13, %xmm14
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm5, %xmm7
-; SSE2-NEXT: paddd %xmm0, %xmm8
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm3, %xmm8
+; SSE2-NEXT: paddd %xmm2, %xmm15
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm13
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: paddd %xmm11, %xmm10
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
 ; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: paddd %xmm14, %xmm6
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: paddd %xmm8, %xmm7
-; SSE2-NEXT: paddd %xmm6, %xmm7
-; SSE2-NEXT: paddd %xmm2, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1]
-; SSE2-NEXT: paddd %xmm7, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: paddd %xmm13, %xmm1
+; SSE2-NEXT: paddd %xmm15, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: addq $184, %rsp
+; SSE2-NEXT: addq $200, %rsp
 ; SSE2-NEXT: retq
 ;
 ; AVX2-LABEL: sad_avx64i8:
@@ -688,8 +688,8 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
 ; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
 ; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6
 ; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5
 ; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7
@@ -697,7 +697,6 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT: .LBB2_1: # %vector.body
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -705,48 +704,49 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8
+; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
+; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8
+; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
+; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
+; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
+; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
-; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15
-; AVX2-NEXT: vpabsd %ymm8, %ymm8
+; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15
+; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpabsd %ymm9, %ymm8
+; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpabsd %ymm10, %ymm8
+; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpabsd %ymm11, %ymm8
 ; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3
-; AVX2-NEXT: vpabsd %ymm14, %ymm8
-; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
-; AVX2-NEXT: vpabsd %ymm13, %ymm8
-; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
 ; AVX2-NEXT: vpabsd %ymm12, %ymm8
 ; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vpabsd %ymm11, %ymm8
-; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
-; AVX2-NEXT: vpabsd %ymm10, %ymm8
-; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
-; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpabsd %ymm13, %ymm8
+; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
+; AVX2-NEXT: vpabsd %ymm14, %ymm8
+; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
 ; AVX2-NEXT: vpabsd %ymm15, %ymm8
-; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
 ; AVX2-NEXT: addq $4, %rax
 ; AVX2-NEXT: jne .LBB2_1
 ; AVX2-NEXT: # BB#2: # %middle.block
 ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4
+; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7
-; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6
-; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5
 ; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7
 ; AVX512F-NEXT: vpabsd %zmm4, %zmm4
-; AVX512F-NEXT: vpabsd %zmm5, %zmm5
-; AVX512F-NEXT: vpabsd %zmm6, %zmm6
-; AVX512F-NEXT: vpabsd %zmm7, %zmm7
-; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3
-; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2
-; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1
 ; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpabsd %zmm5, %zmm4
+; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: vpabsd %zmm6, %zmm4
+; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vpabsd %zmm7, %zmm4
+; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3
 ; AVX512F-NEXT: addq $4, %rax
 ; AVX512F-NEXT: jne .LBB2_1
 ; AVX512F-NEXT: # BB#2: # %middle.block
@@ -1154,59 +1154,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
 ; SSE2-LABEL: sad_nonloop_32i8:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu 16(%rdi), %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm12
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm12, %xmm9
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm13
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm13, %xmm10
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm11
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: movdqu (%rdx), %xmm5
-; SSE2-NEXT: movdqu 16(%rdx), %xmm7
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm14
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm15
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm8
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT: psubd %xmm5, %xmm0
-; SSE2-NEXT: psubd %xmm7, %xmm3
-; SSE2-NEXT: psubd %xmm2, %xmm13
-; SSE2-NEXT: psubd %xmm1, %xmm12
-; SSE2-NEXT: psubd %xmm8, %xmm6
-; SSE2-NEXT: psubd %xmm15, %xmm11
-; SSE2-NEXT: psubd %xmm14, %xmm10
-; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm1, %xmm9
+; SSE2-NEXT: movdqu 16(%rdi), %xmm12
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm12, %xmm8
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm12, %xmm13
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqu (%rdx), %xmm7 +; SSE2-NEXT: movdqu 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm10 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: psubd %xmm6, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psubd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: psubd %xmm3, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 @@ -1215,33 +1210,37 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm13, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm13 ; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: paddd %xmm10, %xmm4 +; SSE2-NEXT: paddd %xmm11, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: psrad 
$31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm6 ; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index ce42d0d643e8b..1afef86a5f11d 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -299,20 +299,21 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; GENERIC-NEXT: testb %dil, %dil ; GENERIC-NEXT: jne LBB7_4 ; GENERIC-NEXT: ## BB#5: +; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: jmp LBB7_6 ; GENERIC-NEXT: LBB7_4: -; GENERIC-NEXT: movd %r9d, %xmm2 -; GENERIC-NEXT: movd %ecx, %xmm3 -; GENERIC-NEXT: movd %r8d, %xmm4 +; GENERIC-NEXT: movd %r9d, %xmm1 +; GENERIC-NEXT: movd %ecx, %xmm2 +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; GENERIC-NEXT: movd %r8d, %xmm3 ; GENERIC-NEXT: movd %edx, %xmm1 ; GENERIC-NEXT: LBB7_6: -; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1 ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0 ; GENERIC-NEXT: movq %xmm0, 16(%rsi) @@ -339,16 +340,19 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; ATOM-NEXT: jmp LBB7_6 ; ATOM-NEXT: LBB7_4: -; ATOM-NEXT: movd %r9d, %xmm2 -; ATOM-NEXT: movd %ecx, %xmm3 -; ATOM-NEXT: movd %r8d, %xmm4 +; ATOM-NEXT: movd %r9d, %xmm1 +; ATOM-NEXT: movd %ecx, %xmm2 +; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; ATOM-NEXT: movd %r8d, %xmm3 ; ATOM-NEXT: movd %edx, %xmm1 -; ATOM-NEXT: LBB7_6: -; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0 ; ATOM-NEXT: psubd 
{{.*}}(%rip), %xmm1 ; ATOM-NEXT: movq %xmm0, 16(%rsi) diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index 1b8f8e7ae559c..2628f824ea407 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -45,64 +45,21 @@ define void @pr26232(i64 %a, <16 x i1> %b) { ; AVX-LABEL: pr26232: ; AVX: # BB#0: # %for_loop599.preheader ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB1_1: # %for_loop599 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: xorl %eax, %eax ; AVX-NEXT: cmpq $65536, %rdi # imm = 0x10000 ; AVX-NEXT: setl %al -; AVX-NEXT: vmovd %eax, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpextrb $15, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $14, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $13, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $12, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $11, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $10, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $9, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $8, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $7, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $6, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $5, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $4, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $3, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $2, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $1, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vpextrb $0, %xmm2, %eax -; AVX-NEXT: andb $1, %al -; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; AVX-NEXT: cmpw $0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: vmovd %eax, %xmm3 +; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX-NEXT: vpand %xmm0, %xmm3, %xmm3 +; AVX-NEXT: vpsllw $7, %xmm3, %xmm3 +; AVX-NEXT: vpand %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3 +; AVX-NEXT: vpmovmskb %xmm3, %eax +; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: jne .LBB1_1 ; AVX-NEXT: # BB#2: # %for_exit600 ; AVX-NEXT: retq diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll index 2996edaec3e0e..332bf2887fb05 100644 --- a/test/CodeGen/X86/setcc-wide-types.ll +++ b/test/CodeGen/X86/setcc-wide-types.ll @@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: ne_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: 
movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: eq_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll index c869dff9e6423..6701c247e6fc5 100644 --- a/test/CodeGen/X86/shrink_vmul_sse.ll +++ b/test/CodeGen/X86/shrink_vmul_sse.ll @@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi ; CHECK-NEXT: movzbl (%edx,%ecx), %edx ; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx +; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movzbl (%eax,%ecx), %eax ; CHECK-NEXT: imull %edx, %eax -; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4) ; CHECK-NEXT: movl %eax, (%esi,%ecx,4) ; CHECK-NEXT: popl %esi diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index 503b9416c8d38..4a0dc9c1eb171 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X32: ## BB#0: ## %entry ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: addss %xmm2, %xmm3 +; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X32-NEXT: retl ; @@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X64: ## BB#0: ## %entry ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-NEXT: movshdup {{.*#+}} xmm3 = 
xmm1[1,1,3,3] -; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: addss %xmm2, %xmm3 +; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X64-NEXT: retq entry: @@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: addps %xmm2, %xmm3 ; X32-NEXT: addps %xmm3, %xmm0 ; X32-NEXT: retl @@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: addps %xmm2, %xmm3 ; X64-NEXT: addps %xmm3, %xmm0 ; X64-NEXT: retq diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll index 226c0adbaf3c3..2fb821555dba5 100644 --- a/test/CodeGen/X86/vector-bitreverse.ll +++ b/test/CodeGen/X86/vector-bitreverse.ll @@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll index a05a981daa1f0..f0a5fe1dbfffb 100644 --- a/test/CodeGen/X86/vector-blend.ll +++ b/test/CodeGen/X86/vector-blend.ll @@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: blend_logic_v8i32: @@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: blend_logic_v8i32: diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index f4d0503f4a792..4181a374c61ce 100644 --- a/test/CodeGen/X86/x86-interleaved-access.ll +++ b/test/CodeGen/X86/x86-interleaved-access.ll @@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) { ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 ; AVX-NEXT: 
vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 +; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1 -; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ; AVX-NEXT: retq %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> @@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) { ; AVX-NEXT: vmovupd 96(%rdi), %ymm3 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0 ; AVX-NEXT: retq %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> @@ -120,9 +120,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) { ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16 diff --git a/test/CodeGen/X86/xchg-nofold.ll b/test/CodeGen/X86/xchg-nofold.ll index fddc7906e08f9..939fa0404223d 100644 --- a/test/CodeGen/X86/xchg-nofold.ll +++ b/test/CodeGen/X86/xchg-nofold.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck %s %"struct.std::atomic" = type { %"struct.std::atomic_bool" } @@ -6,6 +7,28 @@ ; CHECK-LABEL: _Z3fooRSt6atomicIbEb define zeroext i1 @_Z3fooRSt6atomicIbEb(%"struct.std::atomic"* nocapture dereferenceable(1) %a, i1 returned zeroext %b) nounwind { +; CHECK-LABEL: _Z3fooRSt6atomicIbEb: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq $3, %rax +; CHECK-NEXT: movb 2147450880(%rax), %al +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB0_3 +; CHECK-NEXT: # BB#1: +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andl $7, %ecx +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: jge .LBB0_2 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: xchgb %al, (%rdi) +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq __asan_report_store1 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP entry: %frombool.i.i = zext i1 %b to i8 %_M_i.i.i = getelementptr inbounds 
%"struct.std::atomic", %"struct.std::atomic"* %a, i64 0, i32 0, i32 0, i32 0 @@ -30,7 +53,6 @@ entry: ; <label>:11: ; preds = %6, %entry store atomic i8 %frombool.i.i, i8* %_M_i.i.i seq_cst, align 1 -; CHECK: xchgb %{{.*}}, (%{{.*}}) ret i1 %b } diff --git a/test/DebugInfo/MIR/X86/empty-inline.mir b/test/DebugInfo/MIR/X86/empty-inline.mir index 1766a8f446160..71d10fe9de94c 100644 --- a/test/DebugInfo/MIR/X86/empty-inline.mir +++ b/test/DebugInfo/MIR/X86/empty-inline.mir @@ -73,7 +73,6 @@ name: _ZN1C5m_fn3Ev alignment: 4 exposesReturnsTwice: false -noVRegs: true legalized: false regBankSelected: false selected: false diff --git a/test/DebugInfo/omit-empty.ll b/test/DebugInfo/omit-empty.ll index 92450050d2089..8b277676f94ca 100644 --- a/test/DebugInfo/omit-empty.ll +++ b/test/DebugInfo/omit-empty.ll @@ -1,4 +1,5 @@ ; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-objdump -h - | FileCheck %s +; REQUIRES: default_triple ; CHECK-NOT: .debug_ diff --git a/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll index 092c9dc6b95be..f7f63bd6be807 100644 --- a/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll +++ b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll @@ -14,8 +14,8 @@ ; clang++ ../1.cc -O3 -g -S -emit-llvm -fno-strict-aliasing ; and add sanitize_address to @_ZN1A1fEv -; Test that __sanitizer_cov call has !dbg pointing to the opening { of A::f(). -; CHECK: call void @__sanitizer_cov(i32*{{.*}}), !dbg [[A:!.*]] +; Test that __sanitizer_cov_trace_pc_guard call has !dbg pointing to the opening { of A::f(). +; CHECK: call void @__sanitizer_cov_trace_pc_guard(i32*{{.*}}), !dbg [[A:!.*]] ; CHECK: [[A]] = !DILocation(line: 6, scope: !{{.*}}) diff --git a/test/Instrumentation/SanitizerCoverage/coverage.ll b/test/Instrumentation/SanitizerCoverage/coverage.ll index d675c9d9c3709..7b6b5f00442fe 100644 --- a/test/Instrumentation/SanitizerCoverage/coverage.ll +++ b/test/Instrumentation/SanitizerCoverage/coverage.ll @@ -1,16 +1,5 @@ -; RUN: opt < %s -sancov -sanitizer-coverage-level=0 -S | FileCheck %s --check-prefix=CHECK0 -; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s --check-prefix=CHECK1 -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK2 -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=0 -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1 -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK -; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK3 ; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK_TRACE_PC -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 \ -; RUN: -S | FileCheck %s --check-prefix=CHECK2 -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1 \ -; RUN: -S | FileCheck %s --check-prefix=CHECK_WITH_CHECK ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s --check-prefix=CHECKPRUNE target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/test/Instrumentation/SanitizerCoverage/seh.ll b/test/Instrumentation/SanitizerCoverage/seh.ll index ce18334ed2074..f432573af64a6 100644 --- a/test/Instrumentation/SanitizerCoverage/seh.ll +++ b/test/Instrumentation/SanitizerCoverage/seh.ll @@ -1,7 +1,6 @@ ; RUN: opt < %s -sancov -sanitizer-coverage-level=0 -S | FileCheck %s ; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s ; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s -; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=0 -S | FileCheck %s target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" target triple = "i686-pc-windows-msvc18.0.0" diff --git a/test/MC/AMDGPU/ds-err.s b/test/MC/AMDGPU/ds-err.s index 3951efbb60f98..d9f22f5f3ed20 100644 --- a/test/MC/AMDGPU/ds-err.s +++ b/test/MC/AMDGPU/ds-err.s @@ -21,3 +21,93 @@ ds_write2_b32 v2, v4, v6 offset0:1000000000 // CHECK: invalid operand for instruction ds_write2_b32 v2, v4, v6 offset1:1000000000 +//===----------------------------------------------------------------------===// +// swizzle +//===----------------------------------------------------------------------===// + +// CHECK: error: expected a colon +ds_swizzle_b32 v8, v2 offset + +// CHECK: error: failed parsing operand +ds_swizzle_b32 v8, v2 offset: + +// CHECK: error: expected a colon +ds_swizzle_b32 v8, v2 offset- + +// CHECK: error: expected absolute expression +ds_swizzle_b32 v8, v2 offset:SWIZZLE(QUAD_PERM, 0, 1, 2, 3) + +// CHECK: error: expected a swizzle mode +ds_swizzle_b32 v8, v2 offset:swizzle(quad_perm, 0, 1, 2, 3) + +// CHECK: error: expected a swizzle mode +ds_swizzle_b32 v8, v2 offset:swizzle(XXX,1) + +// CHECK: error: expected a comma +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM + +// CHECK: error: expected a comma +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2) + +// CHECK: error: expected a closing parentheses +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3 + +// CHECK: error: expected a closing parentheses +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3, 4) + +// CHECK: error: expected a 2-bit lane id +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, -1, 1, 2, 3) + +// CHECK: error: expected a 2-bit lane id +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 4, 1, 2, 3) + +// CHECK: error: group size must be in the interval [1,16] +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,0) + +// CHECK: error: group size must be a power of two +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,3) + +// CHECK: error: group size must be in the interval [1,16] +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,17) + +// CHECK: error: group size must be in the interval [1,16] +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,32) + +// CHECK: error: group size must be in the interval [2,32] +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,1) + +// CHECK: error: group size must be a power of two +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,3) + +// CHECK: error: group size must be in the interval [2,32] +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,33) + +// CHECK: error: group size must be in the interval [2,32] +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,1,0) + +// CHECK: error: group size must be a power of two +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,3,1) + +// CHECK: error: group size must be in the interval [2,32] +ds_swizzle_b32 v8, v2 
offset:swizzle(BROADCAST,33,1) + +// CHECK: error: lane id must be in the interval [0,group size - 1] +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,-1) + +// CHECK: error: lane id must be in the interval [0,group size - 1] +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,2) + +// CHECK: error: expected a string +ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, pppii) + +// CHECK: error: expected a 5-character mask +ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "") + +// CHECK: error: expected a 5-character mask +ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "ppii") + +// CHECK: error: expected a 5-character mask +ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "pppiii") + +// CHECK: invalid mask +ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "pppi2") diff --git a/test/MC/AMDGPU/ds.s b/test/MC/AMDGPU/ds.s index 18e4957e32d75..ef36a98f746ac 100644 --- a/test/MC/AMDGPU/ds.s +++ b/test/MC/AMDGPU/ds.s @@ -267,10 +267,6 @@ ds_max_rtn_f32 v8, v2, v4 // SICI: ds_max_rtn_f32 v8, v2, v4 ; encoding: [0x00,0x00,0xcc,0xd8,0x02,0x04,0x00,0x08] // VI: ds_max_rtn_f32 v8, v2, v4 ; encoding: [0x00,0x00,0x66,0xd8,0x02,0x04,0x00,0x08] -ds_swizzle_b32 v8, v2 -// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] -// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] - ds_read_b32 v8, v2 // SICI: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0xd8,0xd8,0x02,0x00,0x00,0x08] // VI: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0x6c,0xd8,0x02,0x00,0x00,0x08] @@ -506,3 +502,143 @@ ds_nop // NOSI: error: instruction not supported on this GPU // CI: ds_nop ; encoding: [0x00,0x00,0x50,0xd8,0x00,0x00,0x00,0x00] // VI: ds_nop ; encoding: [0x00,0x00,0x28,0xd8,0x00,0x00,0x00,0x00] + +//===----------------------------------------------------------------------===// +// swizzle +//===----------------------------------------------------------------------===// + +ds_swizzle_b32 v8, v2 +// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0xFFFF +// SICI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,0,1,2,3) ; encoding: [0xe4,0x80,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,0,1,2,3) ; encoding: [0xe4,0x80,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 2, 1, 3, 3) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,2,1,3,3) ; encoding: [0xf6,0x80,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM,2,1,3,3) ; encoding: [0xf6,0x80,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,2) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,2) ; encoding: [0x1f,0x08,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,2) ; encoding: [0x1f,0x08,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,4) +// SICI: 
ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,4) ; encoding: [0x1f,0x10,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,4) ; encoding: [0x1f,0x10,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,8) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,8) ; encoding: [0x1f,0x20,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,8) ; encoding: [0x1f,0x20,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,16) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,16) ; encoding: [0x1f,0x40,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,16) ; encoding: [0x1f,0x40,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,2) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(SWAP,1) ; encoding: [0x1f,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,8) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,8) ; encoding: [0x1f,0x1c,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,8) ; encoding: [0x1f,0x1c,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,16) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,16) ; encoding: [0x1f,0x3c,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,16) ; encoding: [0x1f,0x3c,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,32) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,32) ; encoding: [0x1f,0x7c,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,32) ; encoding: [0x1f,0x7c,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,1) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,1) ; encoding: [0x3e,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,1) ; encoding: [0x3e,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,1) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,1) ; encoding: [0x3c,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,1) ; encoding: [0x3c,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,1) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,1) ; encoding: [0x38,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,1) ; encoding: [0x38,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,1) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,1) ; encoding: [0x30,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,1) ; encoding: [0x30,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, 
v2 offset:swizzle(BROADCAST,2,0) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,0) ; encoding: [0x1e,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,2,0) ; encoding: [0x1e,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,3) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,3) ; encoding: [0x7c,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,4,3) ; encoding: [0x7c,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7) ; encoding: [0xf8,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,8,7) ; encoding: [0xf8,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,15) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,15) ; encoding: [0xf0,0x01,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,16,15) ; encoding: [0xf0,0x01,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,31) +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,31) ; encoding: [0xe0,0x03,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,31) ; encoding: [0xe0,0x03,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "pppii") +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE,4) ; encoding: [0x1f,0x0c,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "01pip") +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"01pip") ; encoding: [0x07,0x09,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"01pip") ; encoding: [0x07,0x09,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0x000 +// SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0x001 +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000p") ; encoding: [0x01,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000p") ; encoding: [0x01,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0x020 +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST,32,1) ; encoding: [0x20,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0x021 +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x21,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x21,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0x400 +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x00,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00001") ; encoding: [0x00,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0x401 +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"0000i") ; encoding: [0x01,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 
offset:swizzle(BITMASK_PERM,"0000i") ; encoding: [0x01,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0x420 +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x20,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x20,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08] + +ds_swizzle_b32 v8, v2 offset:0x421 +// SICI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x21,0x04,0xd4,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM,"00000") ; encoding: [0x21,0x04,0x7a,0xd8,0x02,0x00,0x00,0x08] diff --git a/test/MC/ARM/big-endian-thumb-fixup.s b/test/MC/ARM/big-endian-thumb-fixup.s index 5023fca26be10..4e81469fe489b 100644 --- a/test/MC/ARM/big-endian-thumb-fixup.s +++ b/test/MC/ARM/big-endian-thumb-fixup.s @@ -4,6 +4,7 @@ .text .align 2 .code 16 + .thumb_func @ARM::fixup_arm_thumb_bl .section s_thumb_bl,"ax",%progbits diff --git a/test/MC/ARM/mixed-arm-thumb-bl-fixup.ll b/test/MC/ARM/mixed-arm-thumb-bl-fixup.ll new file mode 100644 index 0000000000000..155ce5a425b45 --- /dev/null +++ b/test/MC/ARM/mixed-arm-thumb-bl-fixup.ll @@ -0,0 +1,77 @@ +; RUN: llc -O0 < %s -mtriple armv7-linux-gnueabi -o - \ +; RUN: | llvm-mc -triple armv7-linux-gnueabi -filetype=obj -o - \ +; RUN: | llvm-readobj -r | FileCheck --check-prefix LINUX %s + +; RUN: llc -O0 < %s -mtriple armv7-linux-android -o - \ +; RUN: | llvm-mc -triple armv7-linux-android -filetype=obj -o - \ +; RUN: | llvm-readobj -r | FileCheck --check-prefix LINUX %s + + +; RUN: llc -O0 < %s -mtriple armv7-apple-ios -o - \ +; RUN: | llvm-mc -triple armv7-apple-ios -filetype=obj -o - \ +; RUN: | llvm-readobj -r | FileCheck --check-prefix IOS %s + + +define void @thumb_caller() #0 { + call void @internal_arm_fn() + call void @global_arm_fn() + call void @internal_thumb_fn() + call void @global_thumb_fn() + ret void +} + +define void @arm_caller() #1 { + call void @internal_arm_fn() + call void @global_arm_fn() + call void @internal_thumb_fn() + call void @global_thumb_fn() + ret void +} + +define internal void @internal_thumb_fn() #0 { + ret void +} + +define void @global_thumb_fn() #0 { +entry: + br label %end +end: + br label %end + ret void +} + +define internal void @internal_arm_fn() #1 { + ret void +} + +define void @global_arm_fn() #1 { +entry: + br label %end +end: + br label %end + ret void +} + +attributes #0 = { "target-features"="+thumb-mode" } +attributes #1 = { "target-features"="-thumb-mode" } + +; LINUX: Section (3) .rel.text { +; LINUX-NEXT: 0x2 R_ARM_THM_CALL internal_arm_fn 0x0 +; LINUX-NEXT: 0x6 R_ARM_THM_CALL global_arm_fn 0x0 +; LINUX-NEXT: 0xE R_ARM_THM_CALL global_thumb_fn 0x0 +; LINUX-NEXT: 0x1C R_ARM_CALL internal_arm_fn 0x0 +; LINUX-NEXT: 0x20 R_ARM_CALL global_arm_fn 0x0 +; LINUX-NEXT: 0x24 R_ARM_CALL internal_thumb_fn 0x0 +; LINUX-NEXT: 0x28 R_ARM_CALL global_thumb_fn 0x0 +; LINUX-NEXT: } + +; IOS: Section __text { +; IOS-NEXT: 0x2C 1 2 0 ARM_RELOC_BR24 0 __text +; IOS-NEXT: 0x28 1 2 0 ARM_RELOC_BR24 0 __text +; IOS-NEXT: 0x24 1 2 0 ARM_RELOC_BR24 0 __text +; IOS-NEXT: 0x20 1 2 0 ARM_RELOC_BR24 0 __text +; IOS-NEXT: 0x10 1 2 0 ARM_THUMB_RELOC_BR22 0 __text +; IOS-NEXT: 0xC 1 2 0 ARM_THUMB_RELOC_BR22 0 __text +; IOS-NEXT: 0x8 1 2 0 ARM_THUMB_RELOC_BR22 0 __text +; IOS-NEXT: 0x4 1 2 0 ARM_THUMB_RELOC_BR22 0 __text +; IOS-NEXT: } diff --git a/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt b/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt index 37725e960f927..5fe7a8cd06219 
100644 --- a/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt +++ b/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt @@ -990,23 +990,23 @@ # CHECK: ds_read_u16 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x79,0xd8,0x01,0x00,0x00,0x05] 0xff,0xff,0x79,0xd8,0x01,0x00,0x00,0x05 -# CHECK: ds_swizzle_b32 v5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05] -0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05 +# CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05] +0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05 -# CHECK: ds_swizzle_b32 v255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff] -0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff +# CHECK: ds_swizzle_b32 v255, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0xff] +0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0xff -# CHECK: ds_swizzle_b32 v5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05] -0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05 +# CHECK: ds_swizzle_b32 v5, v255 ; encoding: [0x00,0x00,0x7a,0xd8,0xff,0x00,0x00,0x05] +0x00,0x00,0x7a,0xd8,0xff,0x00,0x00,0x05 # CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05] 0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05 -# CHECK: ds_swizzle_b32 v5, v1 offset:4 ; encoding: [0x04,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05] -0x04,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05 +# CHECK: ds_swizzle_b32 v5, v1 ; encoding: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05] +0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05 -# CHECK: ds_swizzle_b32 v5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05] -0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05 +# CHECK: ds_swizzle_b32 v5, v1 gds ; encoding: [0x00,0x00,0x7b,0xd8,0x01,0x00,0x00,0x05] +0x00,0x00,0x7b,0xd8,0x01,0x00,0x00,0x05 # CHECK: ds_permute_b32 v5, v1, v2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x05] 0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x05 diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll index 0ec356392a2d4..c5d10a0a67e34 100644 --- a/test/Other/new-pm-defaults.ll +++ b/test/Other/new-pm-defaults.ll @@ -30,6 +30,8 @@ ; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> ; CHECK-O-NEXT: Starting llvm::Module pass manager run. ; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting llvm::Module pass manager run. ; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> @@ -53,7 +55,6 @@ ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. -; CHECK-O-NEXT: Running pass: PGOIndirectCallPromotion ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA ; CHECK-O-NEXT: Running analysis: GlobalsAA ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis @@ -134,6 +135,10 @@ ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Finished CGSCC pass manager run. +; CHECK-O-NEXT: Finished llvm::Module pass manager run. +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting llvm::Module pass manager run. 
+; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running pass: EliminateAvailableExternallyPass ; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA @@ -163,6 +168,7 @@ ; CHECK-O-NEXT: Running pass: GlobalDCEPass ; CHECK-O-NEXT: Running pass: ConstantMergePass ; CHECK-O-NEXT: Finished llvm::Module pass manager run. +; CHECK-O-NEXT: Finished llvm::Module pass manager run. ; CHECK-O-NEXT: Running pass: PrintModulePass ; ; Make sure we get the IR back out without changes when we print the module. diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll new file mode 100644 index 0000000000000..52f475b0397d0 --- /dev/null +++ b/test/Other/new-pm-thinlto-defaults.ll @@ -0,0 +1,221 @@ +; The IR below was crafted so as: +; 1) To have a loop, so we create a loop pass manager +; 2) To be "immutable" in the sense that no pass in the standard +; pipeline will modify it. +; Since no transformations take place, we don't expect any analyses +; to be invalidated. +; Any invalidation that shows up here is a bug, unless we started modifying +; the IR, in which case we need to make it immutable harder. +; +; Prelink pipelines: +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto-pre-link<O1>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-PRELINK-O,CHECK-PRELINK-O1 +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto-pre-link<O2>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O2 +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto-pre-link<O3>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-PRELINK-O,CHECK-PRELINK-O3 +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto-pre-link<Os>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-PRELINK-O,CHECK-PRELINK-Os +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto-pre-link<Oz>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-PRELINK-O,CHECK-PRELINK-Oz +; +; Postlink pipelines: +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto<O1>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-POSTLINK-O,CHECK-POSTLINK-O1 +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto<O2>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-POSTLINK-O,CHECK-POSTLINK-O2 +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto<O3>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-POSTLINK-O,CHECK-POSTLINK-O3 +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto<Os>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-POSTLINK-O,CHECK-POSTLINK-Os +; RUN: opt -disable-verify -debug-pass-manager \ +; RUN: -passes='thinlto<Oz>' -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-POSTLINK-O,CHECK-POSTLINK-Oz +; +; CHECK-O: Starting llvm::Module pass manager run. +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting llvm::Module pass manager run. 
+; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass +; CHECK-POSTLINK-O-NEXT: Running pass: PGOIndirectCallPromotion +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting llvm::Module pass manager run. +; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass +; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> +; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy +; CHECK-O-NEXT: Starting llvm::Function pass manager run. +; CHECK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-O-NEXT: Running analysis: TargetIRAnalysis +; CHECK-O-NEXT: Running analysis: AssumptionAnalysis +; CHECK-O-NEXT: Running pass: SROA +; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis +; CHECK-O-NEXT: Running pass: EarlyCSEPass +; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis +; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run. +; CHECK-O-NEXT: Running pass: IPSCCPPass +; CHECK-O-NEXT: Running pass: GlobalOptPass +; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PromotePass> +; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass +; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> +; CHECK-O-NEXT: Starting llvm::Function pass manager run. +; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-O-NEXT: Finished llvm::Function pass manager run. +; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA +; CHECK-O-NEXT: Running analysis: GlobalsAA +; CHECK-O-NEXT: Running analysis: CallGraphAnalysis +; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis +; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis +; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}> +; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy +; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis +; CHECK-O-NEXT: Starting CGSCC pass manager run. +; CHECK-O-NEXT: Running pass: InlinerPass +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> +; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass +; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy +; CHECK-O-NEXT: Running analysis: AAManager +; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass +; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> +; CHECK-O-NEXT: Starting llvm::Function pass manager run. 
+; CHECK-O-NEXT: Running pass: SROA +; CHECK-O-NEXT: Running pass: EarlyCSEPass +; CHECK-O-NEXT: Running pass: SpeculativeExecutionPass +; CHECK-O-NEXT: Running pass: JumpThreadingPass +; CHECK-O-NEXT: Running analysis: LazyValueAnalysis +; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass +; CHECK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass +; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass +; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass +; CHECK-O-NEXT: Running pass: TailCallElimPass +; CHECK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-O-NEXT: Running pass: ReassociatePass +; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis +; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis +; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}> +; CHECK-O-NEXT: Running analysis: LoopAnalysis +; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis +; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy +; CHECK-O-NEXT: Starting Loop pass manager run. +; CHECK-O-NEXT: Running pass: LoopRotatePass +; CHECK-O-NEXT: Running pass: LICM +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy +; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass +; CHECK-O-NEXT: Finished Loop pass manager run. +; CHECK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}> +; CHECK-O-NEXT: Starting Loop pass manager run. +; CHECK-O-NEXT: Running pass: IndVarSimplifyPass +; CHECK-O-NEXT: Running pass: LoopIdiomRecognizePass +; CHECK-O-NEXT: Running pass: LoopDeletionPass +; CHECK-O-NEXT: Running pass: LoopUnrollPass +; CHECK-O-NEXT: Finished Loop pass manager run. +; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass +; CHECK-Os-NEXT: Running pass: GVN +; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis +; CHECK-Oz-NEXT: Running pass: MergedLoadStoreMotionPass +; CHECK-Oz-NEXT: Running pass: GVN +; CHECK-Oz-NEXT: Running analysis: MemoryDependenceAnalysis +; CHECK-O2-NEXT: Running pass: MergedLoadStoreMotionPass +; CHECK-O2-NEXT: Running pass: GVN +; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis +; CHECK-O3-NEXT: Running pass: MergedLoadStoreMotionPass +; CHECK-O3-NEXT: Running pass: GVN +; CHECK-O3-NEXT: Running analysis: MemoryDependenceAnalysis +; CHECK-O-NEXT: Running pass: MemCpyOptPass +; CHECK-O1-NEXT: Running analysis: MemoryDependenceAnalysis +; CHECK-O-NEXT: Running pass: SCCPPass +; CHECK-O-NEXT: Running pass: BDCEPass +; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis +; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O-NEXT: Running pass: JumpThreadingPass +; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass +; CHECK-O-NEXT: Running pass: DSEPass +; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}> +; CHECK-O-NEXT: Running pass: ADCEPass +; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis +; CHECK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O-NEXT: Finished llvm::Function pass manager run. +; CHECK-O-NEXT: Finished CGSCC pass manager run. +; CHECK-O-NEXT: Finished llvm::Module pass manager run. 
+; CHECK-PRELINK-O-NEXT: Running pass: GlobalOptPass +; CHECK-PRELINK-O-NEXT: Running pass: NameAnonGlobalPass +; CHECK-POSTLINK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-POSTLINK-O-NEXT: Starting llvm::Module pass manager run. +; CHECK-POSTLINK-O-NEXT: Running pass: GlobalOptPass +; CHECK-POSTLINK-O-NEXT: Running pass: EliminateAvailableExternallyPass +; CHECK-POSTLINK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass +; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA +; CHECK-POSTLINK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}> +; CHECK-POSTLINK-O-NEXT: Starting llvm::Function pass manager run. +; CHECK-POSTLINK-O-NEXT: Running pass: Float2IntPass +; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopRotatePass +; CHECK-POSTLINK-O-NEXT: Running pass: LoopDistributePass +; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis +; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis +; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass +; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis +; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass +; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass +; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass +; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopUnrollPass +; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass +; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis +; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass +; CHECK-POSTLINK-O-NEXT: Running pass: AlignmentFromAssumptionsPass +; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass +; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifierPass +; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass +; CHECK-POSTLINK-O-NEXT: Finished llvm::Function pass manager run. +; CHECK-POSTLINK-O-NEXT: Running pass: GlobalDCEPass +; CHECK-POSTLINK-O-NEXT: Running pass: ConstantMergePass +; CHECK-POSTLINK-O-NEXT: Finished llvm::Module pass manager run. +; CHECK-O-NEXT: Finished llvm::Module pass manager run. +; CHECK-O-NEXT: Running pass: PrintModulePass + +; Make sure we get the IR back out without changes when we print the module. +; CHECK-O-LABEL: define void @foo(i32 %n) local_unnamed_addr { +; CHECK-O-NEXT: entry: +; CHECK-O-NEXT: br label %loop +; CHECK-O: loop: +; CHECK-O-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK-O-NEXT: %iv.next = add i32 %iv, 1 +; CHECK-O-NEXT: tail call void @bar() +; CHECK-O-NEXT: %cmp = icmp eq i32 %iv, %n +; CHECK-O-NEXT: br i1 %cmp, label %exit, label %loop +; CHECK-O: exit: +; CHECK-O-NEXT: ret void +; CHECK-O-NEXT: } +; +; CHECK-O-NEXT: Finished llvm::Module pass manager run. 
+ +declare void @bar() local_unnamed_addr + +define void @foo(i32 %n) local_unnamed_addr { +entry: + br label %loop +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, 1 + tail call void @bar() + %cmp = icmp eq i32 %iv, %n + br i1 %cmp, label %exit, label %loop +exit: + ret void +} diff --git a/test/ThinLTO/X86/error-newpm.ll b/test/ThinLTO/X86/newpm-basic.ll index 9c2fd2c70d6dd..d357cbc85d005 100644 --- a/test/ThinLTO/X86/error-newpm.ll +++ b/test/ThinLTO/X86/newpm-basic.ll @@ -1,9 +1,7 @@ ; RUN: opt -module-summary %s -o %t1.bc -; RUN: not llvm-lto2 run %t1.bc -o %t.o \ +; RUN: llvm-lto2 run %t1.bc -o %t.o \ ; RUN: -r=%t1.bc,_tinkywinky,pxl \ -; RUN: -lto-use-new-pm 2>&1 | FileCheck %s - -; CHECK: ThinLTO not supported with the new PM yet! +; RUN: -lto-use-new-pm target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.11.0" diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca.ll new file mode 100644 index 0000000000000..48db0b61a31be --- /dev/null +++ b/test/Transforms/CodeExtractor/PartialInlineAlloca.ll @@ -0,0 +1,68 @@ + +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + ; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + +%"class.base" = type { %"struct.base"* } +%"struct.base" = type opaque + +@g = external local_unnamed_addr global i32, align 4 + +; Function Attrs: nounwind uwtable +define i32 @callee_sinkable_bitcast(i32 %arg) local_unnamed_addr #0 { +; CHECK-LABEL:define{{.*}}@callee_sinkable_bitcast.{{[0-9]}} +; CHECK: alloca +; CHECK-NEXT: bitcast +; CHECK: call void @llvm.lifetime +bb: + %tmp = alloca %"class.base", align 4 + %tmp1 = bitcast %"class.base"* %tmp to i8* + %tmp2 = load i32, i32* @g, align 4, !tbaa !2 + %tmp3 = add nsw i32 %tmp2, 1 + %tmp4 = icmp slt i32 %arg, 0 + br i1 %tmp4, label %bb6, label %bb5 + +bb5: ; preds = %bb + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) #2 + %tmp11 = bitcast %"class.base"* %tmp to i32* + store i32 %tmp3, i32* %tmp11, align 4, !tbaa !2 + store i32 %tmp3, i32* @g, align 4, !tbaa !2 + call void @bar(i32* nonnull %tmp11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) #2 + br label %bb6 + +bb6: ; preds = %bb5, %bb + %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ] + ret i32 %tmp7 +} + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @bar(i32*) local_unnamed_addr #2 +declare void @bar2(i32*, i32*) local_unnamed_addr #1 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind uwtable +define i32 @caller(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = tail call i32 @callee_sinkable_bitcast(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind uwtable} +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303574)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} + + diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca2.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca2.ll new file mode 100644 index 0000000000000..4ca418389e5ef --- /dev/null +++ b/test/Transforms/CodeExtractor/PartialInlineAlloca2.ll @@ -0,0 +1,65 @@ +; RUN: opt < %s -partial-inliner 
-skip-partial-inlining-cost-analysis -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + +%"class.base" = type { %"struct.base"* } +%"struct.base" = type opaque + +@g = external local_unnamed_addr global i32, align 4 + +define i32 @callee_no_bitcast(i32 %arg) local_unnamed_addr #0 { +; CHECK-LABEL:define{{.*}}@callee_no_bitcast.{{[0-9]}} +; CHECK: alloca +; CHECK: call void @llvm.lifetime +bb: + %tmp = alloca i8, align 4 + %tmp2 = load i32, i32* @g, align 4, !tbaa !2 + %tmp3 = add nsw i32 %tmp2, 1 + %tmp4 = icmp slt i32 %arg, 0 + br i1 %tmp4, label %bb6, label %bb5 + +bb5: ; preds = %bb + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp) #2 + store i32 %tmp3, i32* @g, align 4, !tbaa !2 + %tmp11 = bitcast i8 * %tmp to i32* + call void @bar(i32* nonnull %tmp11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp) #2 + br label %bb6 + +bb6: ; preds = %bb5, %bb + %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ] + ret i32 %tmp7 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @bar(i32*) local_unnamed_addr #2 +declare void @bar2(i32*, i32*) local_unnamed_addr #1 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind uwtable +define i32 @caller(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = tail call i32 @callee_no_bitcast(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind uwtable} +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303574)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} + + + diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll new file mode 100644 index 0000000000000..6bb38d44f466c --- /dev/null +++ b/test/Transforms/CodeExtractor/PartialInlineAlloca4.ll @@ -0,0 +1,67 @@ +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + +%"class.base" = type { %"struct.base"* } +%"struct.base" = type opaque + +@g = external local_unnamed_addr global i32, align 4 + +define i32 @callee_unknown_use1(i32 %arg) local_unnamed_addr #0 { +; CHECK-LABEL:define{{.*}}@callee_unknown_use1.{{[0-9]}} +; CHECK-NOT: alloca +; CHECK: call void @llvm.lifetime +bb: + %tmp = alloca i8, align 4 + %tmp2 = load i32, i32* @g, align 4, !tbaa !2 + %tmp3 = add nsw i32 %tmp2, 1 + %tmp4 = icmp slt i32 %arg, 0 + br i1 %tmp4, label %bb6, label %bb5 + +bb5: ; preds = %bb + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp) #2 + store i32 %tmp3, i32* @g, align 4, !tbaa !2 + %tmp11 = bitcast i8* %tmp to i32* + call void @bar(i32* nonnull %tmp11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp) #2 + br label %bb6 + +bb6: ; preds = %bb5, %bb + %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ] + %tmp1 = bitcast i8* %tmp to i32* + ret i32 %tmp7 +} + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @bar(i32*) local_unnamed_addr #2 +declare void @bar2(i32*, i32*) local_unnamed_addr #1 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; 
Function Attrs: nounwind uwtable +define i32 @caller(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = tail call i32 @callee_unknown_use1(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind uwtable} +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303574)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} + + + diff --git a/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll b/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll new file mode 100644 index 0000000000000..9c53496e1ceac --- /dev/null +++ b/test/Transforms/CodeExtractor/PartialInlineAlloca5.ll @@ -0,0 +1,67 @@ +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s + +%"class.base" = type { %"struct.base"* } +%"struct.base" = type opaque + +@g = external local_unnamed_addr global i32, align 4 + +define i32 @callee_unknown_use2(i32 %arg) local_unnamed_addr #0 { +; CHECK-LABEL:define{{.*}}@callee_unknown_use2.{{[0-9]}} +; CHECK-NOT: alloca +; CHECK: call void @llvm.lifetime +bb: + %tmp = alloca i32, align 4 + %tmp1 = bitcast i32* %tmp to i8* + %tmp2 = load i32, i32* @g, align 4, !tbaa !2 + %tmp3 = add nsw i32 %tmp2, 1 + %tmp4 = icmp slt i32 %arg, 0 + br i1 %tmp4, label %bb6, label %bb5 + +bb5: ; preds = %bb + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %tmp1) #2 + store i32 %tmp3, i32* %tmp, align 4, !tbaa !2 + store i32 %tmp3, i32* @g, align 4, !tbaa !2 + call void @bar(i32* nonnull %tmp) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1) #2 + br label %bb6 + +bb6: ; preds = %bb5, %bb + %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ] + %tmp10 = bitcast i8* %tmp1 to i32* + ret i32 %tmp7 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @bar(i32*) local_unnamed_addr #2 +declare void @bar2(i32*, i32*) local_unnamed_addr #1 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: nounwind uwtable +define i32 @caller(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = tail call i32 @callee_unknown_use2(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind uwtable} +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303574)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} + + + diff --git a/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll new file mode 100644 index 0000000000000..e8a4d1281a237 --- /dev/null +++ b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll @@ -0,0 +1,61 @@ +; RUN: opt -S -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s +; RUN: opt -S -passes=partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s +define i32 @test(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = tail call i32 (...) @bar() #1 + %tmp1 = icmp slt i32 %arg, 0 + br i1 %tmp1, label %bb6, label %bb2 + +bb2: ; preds = %bb + tail call void (...) 
@foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  %tmp3 = tail call i32 (...) @bar() #1
+  %tmp4 = icmp eq i32 %tmp3, 10
+  br i1 %tmp4, label %bb6, label %bb5
+
+bb5:                                              ; preds = %bb2
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb2, %bb
+  %tmp7 = phi i32 [ %tmp, %bb5 ], [ 0, %bb ], [ %tmp, %bb2 ]
+  ret i32 %tmp7
+}
+
+declare i32 @bar(...) local_unnamed_addr #1
+
+declare void @foo(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind uwtable
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: @dummy_caller
+; CHECK: codeRepl.i:
+; CHECK: call void @test.1_bb2()
+; CHECK-NOT: load
+; CHECK: br
+
+bb:
+  %tmp = tail call i32 @test(i32 %arg)
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK: .exitStub:
+; CHECK-NOT: store i32 %tmp7, i32* %tmp7.out
+; CHECK: ret
+
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind uwtable }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
diff --git a/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
new file mode 100644
index 0000000000000..a48ff4b1b8f99
--- /dev/null
+++ b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+; RUN: opt -S -passes=partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis < %s | FileCheck %s
+
+define i32 @test(i32 %arg) local_unnamed_addr #0 {
+bb:
+  %tmp = tail call i32 (...) @bar() #1
+  %tmp1 = icmp slt i32 %arg, 0
+  br i1 %tmp1, label %bb6, label %bb2
+
+bb2:                                              ; preds = %bb
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  %tmp3 = tail call i32 (...) @bar() #1
+  %tmp4 = icmp eq i32 %tmp3, 10
+  br i1 %tmp4, label %bb6, label %bb5
+
+bb5:                                              ; preds = %bb2
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  tail call void (...) @foo() #1
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb2, %bb
+  %tmp7 = phi i32 [ 1, %bb5 ], [ 0, %bb ], [ 1, %bb2 ]
+  ret i32 %tmp7
+}
+
+; Function Attrs: nounwind uwtable
+declare i32 @bar(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind uwtable
+declare void @foo(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind uwtable
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+; CHECK-LABEL: @dummy_caller
+; CHECK: codeRepl.i:
+; CHECK: call void @test.1_bb2()
+; CHECK-NOT: load
+; CHECK: br
+bb:
+  %tmp = tail call i32 @test(i32 %arg)
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK: .exitStub:
+; CHECK-NOT: store i32 %tmp7, i32* %tmp7.out
+; CHECK: ret
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303574)"}
diff --git a/test/Transforms/GVN/PRE/phi-translate-2.ll b/test/Transforms/GVN/PRE/phi-translate-2.ll
deleted file mode 100644
index b2993657c7f53..0000000000000
--- a/test/Transforms/GVN/PRE/phi-translate-2.ll
+++ /dev/null
@@ -1,105 +0,0 @@
-; RUN: opt < %s -gvn -S | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-@a = common global [100 x i64] zeroinitializer, align 16
-@b = common global [100 x i64] zeroinitializer, align 16
-@g1 = common global i64 0, align 8
-@g2 = common global i64 0, align 8
-@g3 = common global i64 0, align 8
-declare i64 @goo(...) local_unnamed_addr #1
-
-define void @test1(i64 %a, i64 %b, i64 %c, i64 %d) {
-entry:
-  %mul = mul nsw i64 %b, %a
-  store i64 %mul, i64* @g1, align 8
-  %t0 = load i64, i64* @g2, align 8
-  %cmp = icmp sgt i64 %t0, 3
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  %mul2 = mul nsw i64 %d, %c
-  store i64 %mul2, i64* @g2, align 8
-  br label %if.end
-
-; Check phi-translate works and mul is removed.
-; CHECK-LABEL: @test1(
-; CHECK: if.end:
-; CHECK: %[[MULPHI:.*]] = phi i64 [ {{.*}}, %if.then ], [ %mul, %entry ]
-; CHECK-NOT: = mul
-; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
-if.end:                                           ; preds = %if.then, %entry
-  %b.addr.0 = phi i64 [ %d, %if.then ], [ %b, %entry ]
-  %a.addr.0 = phi i64 [ %c, %if.then ], [ %a, %entry ]
-  %mul3 = mul nsw i64 %a.addr.0, %b.addr.0
-  store i64 %mul3, i64* @g3, align 8
-  ret void
-}
-
-define void @test2(i64 %i) {
-entry:
-  %arrayidx = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i
-  %t0 = load i64, i64* %arrayidx, align 8
-  %arrayidx1 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i
-  %t1 = load i64, i64* %arrayidx1, align 8
-  %mul = mul nsw i64 %t1, %t0
-  store i64 %mul, i64* @g1, align 8
-  %cmp = icmp sgt i64 %mul, 3
-  br i1 %cmp, label %if.then, label %if.end
-
-; Check phi-translate works for the phi generated by loadpre. A new mul will be
-; inserted in if.then block.
-; CHECK-LABEL: @test2(
-; CHECK: if.then:
-; CHECK: %[[MUL_THEN:.*]] = mul
-; CHECK: br label %if.end
-if.then:                                          ; preds = %entry
-  %call = tail call i64 (...)
@goo() #2 - store i64 %call, i64* @g2, align 8 - br label %if.end - -; CHECK: if.end: -; CHECK: %[[MULPHI:.*]] = phi i64 [ %[[MUL_THEN]], %if.then ], [ %mul, %entry ] -; CHECK-NOT: = mul -; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8 -if.end: ; preds = %if.then, %entry - %i.addr.0 = phi i64 [ 3, %if.then ], [ %i, %entry ] - %arrayidx3 = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i.addr.0 - %t2 = load i64, i64* %arrayidx3, align 8 - %arrayidx4 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i.addr.0 - %t3 = load i64, i64* %arrayidx4, align 8 - %mul5 = mul nsw i64 %t3, %t2 - store i64 %mul5, i64* @g3, align 8 - ret void -} - -; Check phi-translate doesn't go through backedge, which may lead to incorrect -; pre transformation. -; CHECK: for.end: -; CHECK-NOT: %{{.*pre-phi}} = phi -; CHECK: ret void -define void @test3(i64 %N, i64* nocapture readonly %a) { -entry: - br label %for.cond - -for.cond: ; preds = %for.body, %entry - %i.0 = phi i64 [ 0, %entry ], [ %add, %for.body ] - %add = add nuw nsw i64 %i.0, 1 - %arrayidx = getelementptr inbounds i64, i64* %a, i64 %add - %tmp0 = load i64, i64* %arrayidx, align 8 - %cmp = icmp slt i64 %i.0, %N - br i1 %cmp, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %call = tail call i64 (...) @goo() #2 - %add1 = sub nsw i64 0, %call - %tobool = icmp eq i64 %tmp0, %add1 - br i1 %tobool, label %for.cond, label %for.end - -for.end: ; preds = %for.body, %for.cond - %i.0.lcssa = phi i64 [ %i.0, %for.body ], [ %i.0, %for.cond ] - %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.0.lcssa - %tmp1 = load i64, i64* %arrayidx2, align 8 - store i64 %tmp1, i64* @g1, align 8 - ret void -} - diff --git a/test/Transforms/GVN/PRE/pre-gep-load.ll b/test/Transforms/GVN/PRE/pre-gep-load.ll index 1b2b4d20d31da..9eec8bb6455b4 100644 --- a/test/Transforms/GVN/PRE/pre-gep-load.ll +++ b/test/Transforms/GVN/PRE/pre-gep-load.ll @@ -37,7 +37,7 @@ sw.bb2: ; preds = %if.end, %entry %3 = load double, double* %arrayidx5, align 8 ; CHECK: sw.bb2: ; CHECK-NOT: sext -; CHECK: phi double [ +; CHECK-NEXT: phi double [ ; CHECK-NOT: load %sub6 = fsub double 3.000000e+00, %3 br label %return diff --git a/test/Transforms/GVN/PRE/pre-load.ll b/test/Transforms/GVN/PRE/pre-load.ll index ffff2b7f08e53..685df24f62b65 100644 --- a/test/Transforms/GVN/PRE/pre-load.ll +++ b/test/Transforms/GVN/PRE/pre-load.ll @@ -72,7 +72,7 @@ block4: %PRE = load i32, i32* %P3 ret i32 %PRE ; CHECK: block4: -; CHECK: phi i32 [ +; CHECK-NEXT: phi i32 [ ; CHECK-NOT: load ; CHECK: ret i32 } @@ -104,7 +104,7 @@ block4: %PRE = load i32, i32* %P3 ret i32 %PRE ; CHECK: block4: -; CHECK: phi i32 [ +; CHECK-NEXT: phi i32 [ ; CHECK-NOT: load ; CHECK: ret i32 } @@ -263,7 +263,7 @@ block4: %PRE = load i32, i32* %P3 ret i32 %PRE ; CHECK: block4: -; CHECK: phi i32 [ +; CHECK-NEXT: phi i32 [ ; CHECK-NOT: load ; CHECK: ret i32 } diff --git a/test/Transforms/Inline/AArch64/gep-cost.ll b/test/Transforms/Inline/AArch64/gep-cost.ll index 204958f082dd6..7d191d37f1fc7 100644 --- a/test/Transforms/Inline/AArch64/gep-cost.ll +++ b/test/Transforms/Inline/AArch64/gep-cost.ll @@ -4,11 +4,21 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" -define void @outer([4 x i32]* %ptr, i32 %i) { +define void @outer1([4 x i32]* %ptr, i32 %i) { call void @inner1([4 x i32]* %ptr, i32 %i) + ret void +} + +define void @outer2([4 x i32]* %ptr, i32 %i) { call void @inner2([4 x i32]* %ptr, i32 %i) ret void } + +define void @outer3([4 
x i32]* %ptr, i32 %j) { + call void @inner3([4 x i32]* %ptr, i32 0, i32 %j) + ret void +} + ; The gep in inner1() is reg+reg, which is a legal addressing mode for AArch64. ; Thus, both the gep and ret can be simplified. ; CHECK: Analyzing call of inner1 @@ -19,7 +29,7 @@ define void @inner1([4 x i32]* %ptr, i32 %i) { ret void } -; The gep in inner2() is reg+imm+reg, which is not a legal addressing mode for +; The gep in inner2() is reg+imm+reg, which is not a legal addressing mode for ; AArch64. Thus, only the ret can be simplified and not the gep. ; CHECK: Analyzing call of inner2 ; CHECK: NumInstructionsSimplified: 1 @@ -28,3 +38,14 @@ define void @inner2([4 x i32]* %ptr, i32 %i) { %G = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i32 1, i32 %i ret void } + +; The gep in inner3() is reg+reg because %i is a known constant from the +; callsite. This case is a legal addressing mode for AArch64. Thus, both the +; gep and ret can be simplified. +; CHECK: Analyzing call of inner3 +; CHECK: NumInstructionsSimplified: 2 +; CHECK: NumInstructions: 2 +define void @inner3([4 x i32]* %ptr, i32 %i, i32 %j) { + %G = getelementptr inbounds [4 x i32], [4 x i32]* %ptr, i32 %i, i32 %j + ret void +} diff --git a/test/Transforms/InstCombine/ctpop.ll b/test/Transforms/InstCombine/ctpop.ll index 6bc6f9731979b..d49a907ffce1d 100644 --- a/test/Transforms/InstCombine/ctpop.ll +++ b/test/Transforms/InstCombine/ctpop.ll @@ -52,3 +52,19 @@ define i1 @test4(i8 %arg) { %res = icmp eq i8 %cnt, 2 ret i1 %res } + +; Test when the number of possible known bits isn't one less than a power of 2 +; and the compare value is greater but less than the next power of 2. +; TODO: The icmp is unnecessary given the known bits of the input. +define i1 @test5(i32 %arg) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[ARG:%.*]], 3 +; CHECK-NEXT: [[CNT:%.*]] = call i32 @llvm.ctpop.i32(i32 [[AND]]) +; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 3 +; CHECK-NEXT: ret i1 [[RES]] +; + %and = and i32 %arg, 3 + %cnt = call i32 @llvm.ctpop.i32(i32 %and) + %res = icmp eq i32 %cnt, 3 + ret i1 %res +} diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll index 5654b265da586..78c98955353e5 100644 --- a/test/Transforms/InstCombine/intrinsics.ll +++ b/test/Transforms/InstCombine/intrinsics.ll @@ -305,6 +305,20 @@ define i1 @cttz_knownbits2(i32 %arg) { ret i1 %res } +; TODO: The icmp is unnecessary given the known bits of the input. +define i1 @cttz_knownbits3(i32 %arg) { +; CHECK-LABEL: @cttz_knownbits3( +; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG:%.*]], 4 +; CHECK-NEXT: [[CNT:%.*]] = call i32 @llvm.cttz.i32(i32 [[OR]], i1 true) #2 +; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[CNT]], 3 +; CHECK-NEXT: ret i1 [[RES]] +; + %or = or i32 %arg, 4 + %cnt = call i32 @llvm.cttz.i32(i32 %or, i1 true) nounwind readnone + %res = icmp eq i32 %cnt, 3 + ret i1 %res +} + define i8 @ctlz(i8 %a) { ; CHECK-LABEL: @ctlz( ; CHECK-NEXT: ret i8 2 @@ -338,6 +352,20 @@ define i1 @ctlz_knownbits2(i8 %arg) { ret i1 %res } +; TODO: The icmp is unnecessary given the known bits of the input. 
+define i1 @ctlz_knownbits3(i8 %arg) { +; CHECK-LABEL: @ctlz_knownbits3( +; CHECK-NEXT: [[OR:%.*]] = or i8 [[ARG:%.*]], 32 +; CHECK-NEXT: [[CNT:%.*]] = call i8 @llvm.ctlz.i8(i8 [[OR]], i1 true) #2 +; CHECK-NEXT: [[RES:%.*]] = icmp eq i8 [[CNT]], 3 +; CHECK-NEXT: ret i1 [[RES]] +; + %or = or i8 %arg, 32 + %cnt = call i8 @llvm.ctlz.i8(i8 %or, i1 true) nounwind readnone + %res = icmp eq i8 %cnt, 3 + ret i1 %res +} + define void @cmp.simplify(i32 %a, i32 %b, i1* %c) { %lz = tail call i32 @llvm.ctlz.i32(i32 %a, i1 false) nounwind readnone %lz.cmp = icmp eq i32 %lz, 32 diff --git a/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll new file mode 100644 index 0000000000000..247ea35ff5d0a --- /dev/null +++ b/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll @@ -0,0 +1,49 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: all_scalar +; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2 +; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions +; +define void @all_scalar(i64* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr i64, i64* %a, i64 %i + store i64 0, i64* %tmp0, align 1 + %i.next = add nuw nsw i64 %i, 2 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +} + +; CHECK-LABEL: PR33193 +; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64 +; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64 +; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions +%struct.a = type { i32, i8 } +define void @PR33193(%struct.a* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %j = phi i32 [ 0, %entry ], [ %j.next, %for.body ] + %tmp0 = getelementptr inbounds %struct.a, %struct.a* %a, i64 %i, i32 1 + store i8 0, i8* %tmp0, align 4 + %j.next = add i32 %j, 1 + %i.next = zext i32 %j.next to i64 + %cond = icmp ugt i64 %n, %i.next + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll b/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll new file mode 100644 index 0000000000000..736ddc32856c0 --- /dev/null +++ b/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll @@ -0,0 +1,104 @@ +; RUN: opt -lower-expect -S -o - < %s | FileCheck %s +; RUN: opt -S -passes='function(lower-expect)' < %s | FileCheck %s + +define i32 @foo(i32 %arg) #0 { +; CHECK-LABEL: @foo(i32{{.*}}) +bb: + %tmp = sext i32 %arg to i64 + %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4) + %tmp2 = icmp ne i64 %tmp1, 0 + br i1 %tmp2, label %bb3, label %bb5 +; CHECK: br i1 %tmp2{{.*}}!prof [[LIKELY:![0-9]+]] + +bb3: ; preds = %bb + %tmp4 = call i32 (...) 
@bar() + br label %bb5 + +bb5: ; preds = %bb3, %bb + ret i32 1 +} + +define i32 @foo2(i32 %arg) #0 { +; CHECK-LABEL: @foo2 +bb: + %tmp = sext i32 %arg to i64 + %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4) + %tmp2 = icmp eq i64 %tmp1, 2 + br i1 %tmp2, label %bb3, label %bb5 +; CHECK: br i1 %tmp2{{.*}}!prof [[UNLIKELY:![0-9]+]] + +bb3: ; preds = %bb + %tmp4 = call i32 (...) @bar() + br label %bb5 + +bb5: ; preds = %bb3, %bb + ret i32 1 +} + +define i32 @foo3(i32 %arg) #0 { +; CHECK-LABEL: @foo3 +bb: + %tmp = sext i32 %arg to i64 + %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4) + %tmp2 = icmp eq i64 %tmp1, 4 + br i1 %tmp2, label %bb3, label %bb5 +; CHECK: br i1 %tmp2{{.*}}!prof [[LIKELY]] + +bb3: ; preds = %bb + %tmp4 = call i32 (...) @bar() + br label %bb5 + +bb5: ; preds = %bb3, %bb + ret i32 1 +} + +define i32 @foo4(i32 %arg) #0 { +; CHECK-LABEL: @foo4 +bb: + %tmp = sext i32 %arg to i64 + %tmp1 = call i64 @llvm.expect.i64(i64 %tmp, i64 4) + %tmp2 = icmp ne i64 %tmp1, 2 + br i1 %tmp2, label %bb3, label %bb5 +; CHECK: br i1 %tmp2{{.*}}!prof [[LIKELY]] + +bb3: ; preds = %bb + %tmp4 = call i32 (...) @bar() + br label %bb5 + +bb5: ; preds = %bb3, %bb + ret i32 1 +} + +define i32 @foo5(i32 %arg, i32 %arg1) #0 { +; CHECK-LABEL: @foo5 +bb: + %tmp = sext i32 %arg1 to i64 + %tmp2 = call i64 @llvm.expect.i64(i64 %tmp, i64 4) + %tmp3 = sext i32 %arg to i64 + %tmp4 = icmp ne i64 %tmp2, %tmp3 + br i1 %tmp4, label %bb5, label %bb7 +; CHECK-NOT: !prof + +bb5: ; preds = %bb + %tmp6 = call i32 (...) @bar() + br label %bb7 + +bb7: ; preds = %bb5, %bb + ret i32 1 +} + +declare i64 @llvm.expect.i64(i64, i64) #1 + +declare i32 @bar(...) local_unnamed_addr #0 + +attributes #0 = { nounwind uwtable } +attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 304373)"} +; CHECK: [[LIKELY]] = !{!"branch_weights", i32 2000, i32 1} +; CHECK: [[UNLIKELY]] = !{!"branch_weights", i32 1, i32 2000} + diff --git a/test/Transforms/NewGVN/completeness.ll b/test/Transforms/NewGVN/completeness.ll index bafe5f966d22a..2b28f12df9d12 100644 --- a/test/Transforms/NewGVN/completeness.ll +++ b/test/Transforms/NewGVN/completeness.ll @@ -389,6 +389,23 @@ bb6: ; preds = %bb6, %bb2 ;; Ensure that we revisit predicateinfo operands at the right points in time. 
define void @test10() { +; CHECK-LABEL: @test10( +; CHECK-NEXT: b: +; CHECK-NEXT: br label [[G:%.*]] +; CHECK: g: +; CHECK-NEXT: [[N:%.*]] = phi i32* [ [[H:%.*]], [[I:%.*]] ], [ null, [[B:%.*]] ] +; CHECK-NEXT: [[H]] = getelementptr i32, i32* [[N]], i64 1 +; CHECK-NEXT: [[J:%.*]] = icmp eq i32* [[H]], getelementptr (i32, i32* null, i64 8) +; CHECK-NEXT: br i1 [[J]], label [[C:%.*]], label [[I]] +; CHECK: i: +; CHECK-NEXT: br i1 undef, label [[K:%.*]], label [[G]] +; CHECK: k: +; CHECK-NEXT: br i1 false, label [[C]], label [[O:%.*]] +; CHECK: o: +; CHECK-NEXT: br label [[C]] +; CHECK: c: +; CHECK-NEXT: ret void +; b: %m = getelementptr i32, i32* null, i64 8 br label %g diff --git a/test/Transforms/NewGVN/pr33185.ll b/test/Transforms/NewGVN/pr33185.ll new file mode 100644 index 0000000000000..c687d8fe51eba --- /dev/null +++ b/test/Transforms/NewGVN/pr33185.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -newgvn -S %s | FileCheck %s + +@a = local_unnamed_addr global i32 9, align 4 +@.str4 = private unnamed_addr constant [6 x i8] c"D:%d\0A\00", align 1 + +define i32 @main() local_unnamed_addr { +; CHECK-LABEL: @main( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[CMP1_I:%.*]] = icmp ne i32 [[TMP]], 0 +; CHECK-NEXT: br label [[FOR_BODY_I:%.*]] +; CHECK: for.body.i: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[COND_END_I:%.*]] ] +; CHECK-NEXT: [[F_08_I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[COND_END_I]] ] +; CHECK-NEXT: [[MUL_I:%.*]] = select i1 [[CMP1_I]], i32 [[F_08_I]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[COND_END_I]], label [[COND_TRUE_I:%.*]] +; CHECK: cond.true.i: +; CHECK-NEXT: [[DIV_I:%.*]] = udiv i32 [[MUL_I]], [[F_08_I]] +; CHECK-NEXT: br label [[COND_END_I]] +; CHECK: cond.end.i: +; CHECK-NEXT: [[COND_I:%.*]] = phi i32 [ [[DIV_I]], [[COND_TRUE_I]] ], [ 0, [[FOR_BODY_I]] ] +; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[F_08_I]], 1 +; CHECK-NEXT: [[EXITCOND_I:%.*]] = icmp eq i32 [[INC_I]], 4 +; CHECK-NEXT: br i1 [[EXITCOND_I]], label [[FN1_EXIT:%.*]], label [[FOR_BODY_I]] +; CHECK: fn1.exit: +; CHECK-NEXT: [[CALL4:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str4, i64 0, i64 0), i32 [[COND_I]]) +; CHECK-NEXT: ret i32 0 +; +entry: + %tmp = load i32, i32* @a, align 4 + %cmp1.i = icmp ne i32 %tmp, 0 + br label %for.body.i + +for.body.i: + %tmp1 = phi i1 [ true, %entry ], [ false, %cond.end.i ] + %f.08.i = phi i32 [ 0, %entry ], [ %inc.i, %cond.end.i ] + %mul.i = select i1 %cmp1.i, i32 %f.08.i, i32 0 + br i1 %tmp1, label %cond.end.i, label %cond.true.i + +cond.true.i: + ;; Ensure we don't replace this divide with a phi of ops that merges the wrong loop iteration value + %div.i = udiv i32 %mul.i, %f.08.i + br label %cond.end.i + +cond.end.i: + %cond.i = phi i32 [ %div.i, %cond.true.i ], [ 0, %for.body.i ] + %inc.i = add nuw nsw i32 %f.08.i, 1 + %exitcond.i = icmp eq i32 %inc.i, 4 + br i1 %exitcond.i, label %fn1.exit, label %for.body.i + +fn1.exit: + %cond.i.lcssa = phi i32 [ %cond.i, %cond.end.i ] + %call4= tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str4, i64 0, i64 0), i32 %cond.i.lcssa) + ret i32 0 +} + +declare i32 @printf(i8* nocapture readonly, ...) 
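The guarded udiv above is the crux of pr33185: NewGVN's phi-of-ops transformation rewrites an operation on a phi as a phi of per-predecessor results, and applying it to %div.i would pair %mul.i with the previous iteration's %f.08.i, exactly the "wrong loop iteration value" the test comment warns about. For contrast, a hand-written sketch of the legal, single-merge shape of phi-of-ops (illustration only, not taken from the test):

; (hand-written illustration, not from the test)
; Before: one operation applied to a merge.
;   %x = phi i32 [ %a, %p1 ], [ %b, %p2 ]
;   %y = add i32 %x, 1
; After phi-of-ops: evaluate the operation in each predecessor,
; then merge the results.
;   p1:  %y1 = add i32 %a, 1
;   p2:  %y2 = add i32 %b, 1
;   %y = phi i32 [ %y1, %p1 ], [ %y2, %p2 ]

In the loop above, the phi feeding %mul.i carries values across the backedge, so that rewrite is unsafe, and the autogenerated CHECK lines assert the udiv stays in cond.true.i.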
+ diff --git a/test/Transforms/PGOProfile/branch1.ll b/test/Transforms/PGOProfile/branch1.ll index 3db7566d50789..f675b1f1a0118 100644 --- a/test/Transforms/PGOProfile/branch1.ll +++ b/test/Transforms/PGOProfile/branch1.ll @@ -15,6 +15,9 @@ ; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE ; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.l.profdata -S | FileCheck %s --check-prefix=USE-LARGE +; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -pass-remarks-analysis=pgo-use-annot -pgo-emit-branch-prob -S 2>&1| FileCheck %s --check-prefix=ANALYSIS +; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -pass-remarks-analysis=pgo-use-annot -pgo-emit-branch-prob -S 2>&1| FileCheck %s --check-prefix=ANALYSIS + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; GEN-DARWIN-LINKONCE: target triple = "x86_64-apple-darwin" @@ -54,3 +57,5 @@ if.end: ; USE-DAG: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}} ; USE-DAG: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}} ; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 3} + +; ANALYSIS:remark: <unknown>:0:0: sgt_i32_Zero {{.*}}66.67% (total count : 3) diff --git a/test/Transforms/ThinLTOBitcodeWriter/new-pm.ll b/test/Transforms/ThinLTOBitcodeWriter/new-pm.ll new file mode 100644 index 0000000000000..03facd072b347 --- /dev/null +++ b/test/Transforms/ThinLTOBitcodeWriter/new-pm.ll @@ -0,0 +1,9 @@ +; RUN: opt -passes='no-op-module' -debug-pass-manager -thinlto-bc -thin-link-bitcode-file=%t2 -o %t %s 2>&1 | FileCheck %s --check-prefix=DEBUG_PM +; RUN: llvm-bcanalyzer -dump %t2 | FileCheck %s --check-prefix=BITCODE + +; DEBUG_PM: ThinLTOBitcodeWriterPass +; BITCODE: Foo + +define void @Foo() { + ret void +} diff --git a/test/Transforms/Util/PredicateInfo/condprop.ll b/test/Transforms/Util/PredicateInfo/condprop.ll index 79c76baa6f619..61f59f03e1bc2 100644 --- a/test/Transforms/Util/PredicateInfo/condprop.ll +++ b/test/Transforms/Util/PredicateInfo/condprop.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -print-predicateinfo -analyze < %s 2>&1 | FileCheck %s +; RUN: opt -print-predicateinfo -analyze -reverse-iterate < %s 2>&1 | FileCheck %s @a = external global i32 ; <i32*> [#uses=7] @@ -98,10 +99,10 @@ define void @test3(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) ; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) -; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]] ; CHECK: both_zero: @@ -382,8 +383,8 @@ ret: define i32 @test10(i32 %j, i32 %i) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]] -; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]]) ; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]]) +; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]]) ; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]] ; CHECK: cond_true: ; 
CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]] diff --git a/test/Transforms/Util/PredicateInfo/testandor.ll b/test/Transforms/Util/PredicateInfo/testandor.ll index 5942ed155318c..43c508670908b 100644 --- a/test/Transforms/Util/PredicateInfo/testandor.ll +++ b/test/Transforms/Util/PredicateInfo/testandor.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -print-predicateinfo < %s 2>&1 | FileCheck %s +; RUN: opt -print-predicateinfo -reverse-iterate < %s 2>&1 | FileCheck %s declare void @foo(i1) declare void @bar(i32) @@ -10,10 +11,10 @@ define void @testor(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]] -; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) ; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) -; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]] ; CHECK: oneof: @@ -54,10 +55,10 @@ define void @testand(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) ; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) -; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] ; CHECK: both: @@ -98,9 +99,9 @@ define void @testandsame(i32 %x, i32 %y) { ; CHECK-NEXT: [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0 ; CHECK-NEXT: [[XLT:%.*]] = icmp slt i32 [[X]], 100 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XGT]], [[XLT]] -; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]]) ; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) ; CHECK: [[X_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X_0]]) +; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]]) ; CHECK: [[XLT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XLT]]) ; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]] @@ -136,23 +137,23 @@ define void @testandassume(i32 %x, i32 %y) { ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]] -; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) -; CHECK: [[TMP2:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) -; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) -; CHECK: [[TMP4:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[TMP1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]]) +; CHECK: [[TMP2:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]]) +; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]]) +; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]]) ; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[TMP5]]) -; CHECK: [[DOT0:%.*]] = call 
i1 @llvm.ssa.copy.i1(i1 [[TMP1]]) +; CHECK: [[DOT0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP1]]) ; CHECK: [[DOT01:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP2]]) ; CHECK: [[DOT02:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP3]]) -; CHECK: [[DOT03:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP4]]) +; CHECK: [[DOT03:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP4]]) ; CHECK: [[DOT04:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP5]]) ; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]] ; CHECK: both: -; CHECK-NEXT: call void @foo(i1 [[DOT0]]) ; CHECK-NEXT: call void @foo(i1 [[DOT02]]) +; CHECK-NEXT: call void @foo(i1 [[DOT03]]) +; CHECK-NEXT: call void @bar(i32 [[DOT0]]) ; CHECK-NEXT: call void @bar(i32 [[DOT01]]) -; CHECK-NEXT: call void @bar(i32 [[DOT03]]) ; CHECK-NEXT: ret void ; CHECK: nope: ; CHECK-NEXT: call void @foo(i1 [[DOT04]]) diff --git a/test/tools/llvm-config/cflags.test b/test/tools/llvm-config/cflags.test index ef3e486bd968a..461de86b64c0b 100644 --- a/test/tools/llvm-config/cflags.test +++ b/test/tools/llvm-config/cflags.test @@ -4,4 +4,4 @@ RUN: llvm-config --cxxflags 2>&1 | FileCheck %s CHECK: -I CHECK: {{[/\\]}}include CHECK-NOT: error: -CHECK-NOT: warning +CHECK-NOT: warning: diff --git a/test/tools/llvm-cvtres/Inputs/test_resource.rc b/test/tools/llvm-cvtres/Inputs/test_resource.rc index fd616520dbe1b..5ca097baa0f73 100644 --- a/test/tools/llvm-cvtres/Inputs/test_resource.rc +++ b/test/tools/llvm-cvtres/Inputs/test_resource.rc @@ -42,3 +42,9 @@ LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS MENUITEM "salad", 101
MENUITEM "duck", 102
}
+
+
+myresource stringarray {
+ "this is a user defined resource\0",
+ "it contains many strings\0",
+}
\ No newline at end of file
diff --git a/test/tools/llvm-cvtres/Inputs/test_resource.res b/test/tools/llvm-cvtres/Inputs/test_resource.res
index c577ecc3d6333..d422bb4904da4 100644
Binary files differ
diff --git a/test/tools/llvm-cvtres/resource.test b/test/tools/llvm-cvtres/resource.test
index 16970343c60dd..b9be74bf671b2 100644
--- a/test/tools/llvm-cvtres/resource.test
+++ b/test/tools/llvm-cvtres/resource.test
@@ -4,4 +4,48 @@
 RUN: llvm-cvtres %p/Inputs/test_resource.res | FileCheck %s
 
-CHECK: Number of resources: 7
+CHECK: Number of resources: 8
+CHECK-NEXT: Resource Tree [
+CHECK-NEXT: STRINGARRAY [
+CHECK-NEXT: MYRESOURCE [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 2 [
+CHECK-NEXT: CURSOR [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: OKAY [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 4 [
+CHECK-NEXT: "EAT" [
+CHECK-NEXT: 3081 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 14432 [
+CHECK-NEXT: 2052 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 5 [
+CHECK-NEXT: TESTDIALOG [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 9 [
+CHECK-NEXT: MYACCELERATORS [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: 12 [
+CHECK-NEXT: 1033 [
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
+CHECK-NEXT: ]
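To decode the tree: llvm-cvtres walks the Windows resource directory as Type -> Name -> Language. The user-defined "stringarray" resource added to test_resource.rc above is what bumps the count from 7 to 8 and introduces the STRINGARRAY/MYRESOURCE/1033 branch, while the "EAT" menu sits under type 4 (RT_MENU) with LANGID 3081 (LANG_ENGLISH, SUBLANG_ENGLISH_AUS); 1033 is the default en-US LANGID. If the binary test_resource.res ever needs regenerating from the .rc, a Windows resource compiler does it; an illustrative command, assuming Microsoft's rc.exe since the test inputs do not record the exact tool:

> REM illustrative only; the tool and flags used to build the checked-in .res are an assumption
> rc.exe /fo test_resource.res test_resource.rc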